From fe508089aaabf49ca1fd8670e1684c82a82e9c8d Mon Sep 17 00:00:00 2001 From: Louis Ulmer Date: Tue, 4 Feb 2025 16:14:13 +0000 Subject: [PATCH 0001/1240] feat : adding a new parser for llama Signed-off-by: Louis Ulmer --- .../openai/tool_parsers/__init__.py | 3 +- .../llama_usr_defined_tool_parser.py | 248 ++++++++++++++++++ 2 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index d1c3afa64b9..512335177e1 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -9,10 +9,11 @@ from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser from .pythonic_tool_parser import PythonicToolParser +from .llama_usr_defined_tool_parser import Llama3UserDefinedCustomToolParser __all__ = [ "ToolParser", "ToolParserManager", "Granite20bFCToolParser", "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", - "PythonicToolParser" + "PythonicToolParser", "Llama3UserDefinedCustomToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py new file mode 100644 index 00000000000..ddc11c8e945 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 + +import json +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + +def _count_substring(string, substring): + """ + Counts the number of non-overlapping occurrences of a substring in a string. + + Args: + string (str): The string to search in. + substring (str): The substring to search for. + + Returns: + int: The number of non-overlapping occurrences of the substring in the string. 
+    """
+    count = 0
+    start = 0
+    while True:
+        start = string.find(substring, start)
+        if start == -1:
+            break
+        count += 1
+        start += len(substring)
+    return count
+
+@ToolParserManager.register_module("llama3_user_defined_custom")
+class Llama3UserDefinedCustomToolParser(ToolParser):
+
+    def __init__(self, tokenizer: AnyTokenizer):
+        super().__init__(tokenizer)
+
+        if isinstance(self.model_tokenizer, MistralTokenizer):
+            logger.error(
+                "Detected Mistral tokenizer when using a Llama model")
+            self.model_tokenizer = self.model_tokenizer.tokenizer
+
+        self.prev_tool_call_arr: List[Dict] = []
+        self.streamed_args_for_tool: List[str] = []
+        self.is_parsing_toolcall = False
+
+        self.nb_tool_calls = 0
+        self.current_tool_name=""
+        self.current_tool_call_uuid=""
+        self.is_current_tool_name_sent = False
+        self.tool_call_start_token: str = "<function="
+        self.tool_call_precall_token: str = '>{"'
+        self.tool_call_end_token: str = "</function>"
+        self.bot_token = "<|python_tag|>"
+
+        self.tool_call_start_token_id = tokenizer.encode(self.tool_call_start_token,
+                                                         add_special_tokens=False)
+        self.tool_call_end_token_id = tokenizer.encode(self.tool_call_end_token,
+                                                       add_special_tokens=False)
+        self.tool_call_preargs_token_id = tokenizer.encode(self.tool_call_precall_token,
+                                                           add_special_tokens=False)
+        self.bot_token_id = tokenizer.encode(self.bot_token,
+                                             add_special_tokens=False)
+
+        self.tool_call_regex = re.compile(r"<function=([^<>]+)>\{([^}]+)\}(?:</function>|>)?")
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ToolParser "
+                "constructor during construction.")
+
+    def extract_tool_calls(
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+    ) -> ExtractedToolCallInformation:
+
+        # sanity check; avoid unnecessary processing
+        if self.tool_call_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        else:
+            try:
+                # there are two possible captures - between tags, or between a
+                # tag and end-of-string so the result of
+                # findall is an array of tuples where one is a function call and
+                # the other is None
+                function_call_tuples = self.tool_call_regex.findall(model_output)
+
+                logger.info("function_call_tuples: %s", function_call_tuples)
+                print("function_call_tuples: %s", function_call_tuples)
+
+                # load the JSON, and then use it to build the Function and
+                # Tool Call
+                raw_function_calls = [
+                    {
+                        "name":match[0],
+                        "arguments":json.loads("{"+match[1]+"}")
+                    }
+                    for match in function_call_tuples
+                ]
+                tool_calls = [
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=function_call["name"],
+                            # function call args are JSON but as a string
+                            arguments=json.dumps(function_call["arguments"],
+                                                 ensure_ascii=False)))
+                    for function_call in raw_function_calls
+                ]
+
+                content = model_output[:model_output.
+                                       find(self.tool_call_start_token)]
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=tool_calls,
+                    content=content if content else None)
+
+            except Exception:
+                logger.exception(
+                    "Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(tools_called=False,
+                                                    tool_calls=[],
+                                                    content=model_output)
+
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract tool calls from a streaming response.
+        Handles format: <function=NAME>{arguments}</function>
+        Returns DeltaMessage with either tool_calls or content.
+ """ + logger.debug("\n" + "="*50) + logger.debug("STREAMING FUNCTION CALLED") + logger.debug("Tool call start token id IDs:", self.tool_call_start_token_id) + logger.debug("Tool call precall token id IDs:", self.tool_call_preargs_token_id) + logger.debug("Tool call end token id IDs:", self.tool_call_end_token_id) + logger.debug("Previous text:", previous_text) + logger.debug("Current text:", current_text) + logger.debug("Delta text:", delta_text) + logger.debug("Previous token IDs:", previous_token_ids) + logger.debug("Current token IDs:", current_token_ids) + logger.debug("Delta token IDs:", delta_token_ids) + logger.debug("Current tool name sent:", self.is_current_tool_name_sent) + logger.debug("-"*50 + "\n") + flags = Allow.ALL if self.is_current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + + logger.debug(f"{delta_token_ids[0] in self.tool_call_start_token_id=}") + if delta_token_ids[0] in self.tool_call_start_token_id : + # We possibly have a tool call (not sure yet) we don't stream + + logger.debug(f"{_count_substring(current_text,self.tool_call_start_token)=}") + if _count_substring(current_text,self.tool_call_start_token) > self.nb_tool_calls \ + and not self.is_parsing_toolcall : + + self.is_parsing_toolcall=True + self.nb_tool_calls +=1 #will serve as id + self.current_tool_call_uuid = random_uuid() + logger.debug("New tool call detected, id:", self.nb_tool_calls-1) + return None # going to the next iter + else : + logger.debug("Tool call already parsed, id:", self.nb_tool_calls-1) + + if self.is_parsing_toolcall and not self.is_current_tool_name_sent : + logger.debug("Parsing tool call, id:", self.nb_tool_calls-1) + # We are parsing a tool call, we need to parse the tool name + if delta_token_ids != self.tool_call_preargs_token_id: + self.current_tool_name += delta_text + logger.debug(f"{self.current_tool_name=}") + return None # moving on to the next iteration + else : + self.current_tool_name = self.current_tool_name.lstrip('=') + self.is_current_tool_name_sent = True + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.nb_tool_calls - 1, + type="function", + id=f"chatcmpl-tool-{self.current_tool_call_uuid}", + function=DeltaFunctionCall( + name=self.current_tool_name)) + ]) + + if self.is_current_tool_name_sent : + logger.debug("Parsed tool name : ", self.current_tool_name) + + if _count_substring(current_text,self.tool_call_end_token) < self.nb_tool_calls: + self.streamed_args_for_tool.append(delta_text) + return None # moving on to the next iteration + else : + arguments = '{"'+''.join(self.streamed_args_for_tool) # adding back {" at the beginning for valid JSON + arguments = arguments.rstrip(self.tool_call_end_token) # removing the end token + logger.debug("Concatenated tool call arguments : ", arguments) + + current_tool_args = partial_json_parser.loads( + arguments or "{}", + flags) if self.streamed_args_for_tool else None + + logger.debug("Parsed tool call arguments : ", current_tool_args) + + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.nb_tool_calls - 1, + type="function", + id=f"chatcmpl-tool-{self.current_tool_call_uuid}", + function=DeltaFunctionCall( + name=self.current_tool_name, + arguments=json.dumps(current_tool_args))) + ]) + + self.reset_state() + + return delta + else : + logger.debug("No tool call detected, returning just text : ", delta_text) + return DeltaMessage(content=delta_text) + + def reset_state(self): + self.current_tool_name = '' + self.is_parsing_toolcall=False + self.is_current_tool_name_sent = False + 
self.streamed_args_for_tool = []
\ No newline at end of file

From 085fe89737d98664ad1a4efe9b58037acb743abd Mon Sep 17 00:00:00 2001
From: Louis Ulmer
Date: Tue, 4 Feb 2025 16:46:13 +0000
Subject: [PATCH 0002/1240] feat: add an example of the chat template required for the usr def tool parser

Signed-off-by: Louis Ulmer
---
 ..._template_llama3.1_usr_def_tool_call.jinja | 131 ++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 examples/tool_chat_template_llama3.1_usr_def_tool_call.jinja

diff --git a/examples/tool_chat_template_llama3.1_usr_def_tool_call.jinja b/examples/tool_chat_template_llama3.1_usr_def_tool_call.jinja
new file mode 100644
index 00000000000..07a118c6e76
--- /dev/null
+++ b/examples/tool_chat_template_llama3.1_usr_def_tool_call.jinja
@@ -0,0 +1,131 @@
+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = false %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+
+{%- if builtin_tools is defined %}
+    {{- "# Tool Instructions\n"}}
+    {{- "- Always execute python code in messages that you share.\n"}}
+    {{- "- When looking for real time information use relevant functions if available else fallback to brave_search\n\n\n"}}
+{%- endif %}
+
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions:\n\n"}}
+
+    {%- for t in tools %}
+        {%- if t.function is defined %}
+            {%- set t = t.function %}
+        {%- endif -%}
+        {{- "Use the function '"+t.name+"' to: "+t.description+"\n"}}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- "If a you choose to call a function ONLY reply in the following format:\n"}}
+    {{- "<{start_tag}={function_name}>{parameters}{end_tag}\n" }}
+    {{- "where\n\n"}}
+    {{- "start_tag => `<function`\n" }}
+    {{- "parameters => a JSON dict with the function argument name as key and function argument value as value.\n"}}
+    {{- "end_tag => `</function>`" }}
+    {{- "\n\n" }}
+    {{- "Here is an example,\n"}}
+    {{- "<function=example_function_name>{\"example_name\": \"example_value\"}</function>"}}
+    {{- "\n\n" }}
+    {{- "Reminder:\n"}}
+    {{- "- Function calls MUST follow the specified format\n"}}
+    {{- "- Required parameters MUST be specified\n"}}
+    {{- "- Only call one function at a time\n"}}
+    {{- "- Put the entire function call reply on one line\n"}}
+    {{- "- Always use the information returned by the function to answer to the user\n"}}
+    {{- "- If there is no relevant function available, do NOT call any function: respond directly to the user\n\n"}}
+
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+            {%- endfor %}
+            {{- ")" }}
+        {%- else %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '<function=' + tool_call.name + '>' + tool_call.arguments + '</function>'}}
+        {%- endif %}
+        {%- if builtin_tools is defined or tools is not none%}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}
\ No newline at end of file

From df0ac52d8d7425bf2965cf0646b2cc8405a25ce1 Mon Sep 17 00:00:00 2001
From: Louis Ulmer
Date: Sun, 6 Apr 2025 14:09:38 +0000
Subject: [PATCH 0003/1240] fix: addressing the linting comments

Signed-off-by: Louis Ulmer
---
 .../llama_usr_defined_tool_parser.py          | 52 ++++++++++++-------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py
index ddc11c8e945..c40df2a161a 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_usr_defined_tool_parser.py
@@ -22,14 +22,16 @@
 
 def _count_substring(string, substring):
     """
-    Counts the number of non-overlapping occurrences of a substring in a string.
+    Counts the number of non-overlapping occurrences of a substring in
+    a string.
 
     Args:
        string (str): The string to search in.
        substring (str): The substring to search for.
 
     Returns:
-        int: The number of non-overlapping occurrences of the substring in the string. 
+        int: The number of non-overlapping occurrences of the substring in
+        the string.
     """
     count = 0
     start = 0
@@ -65,19 +67,20 @@ def __init__(self, tokenizer: AnyTokenizer):
         self.tool_call_end_token: str = "</function>"
         self.bot_token = "<|python_tag|>"
 
-        self.tool_call_start_token_id = tokenizer.encode(self.tool_call_start_token,
-                                                         add_special_tokens=False)
+        self.tool_call_start_token_id = tokenizer.encode(
+            self.tool_call_start_token, add_special_tokens=False)
         self.tool_call_end_token_id = tokenizer.encode(self.tool_call_end_token,
                                                        add_special_tokens=False)
-        self.tool_call_preargs_token_id = tokenizer.encode(self.tool_call_precall_token,
-                                                           add_special_tokens=False)
+        self.tool_call_preargs_token_id = tokenizer.encode(
+            self.tool_call_precall_token, add_special_tokens=False)
         self.bot_token_id = tokenizer.encode(self.bot_token,
                                              add_special_tokens=False)
 
-        self.tool_call_regex = re.compile(r"<function=([^<>]+)>\{([^}]+)\}(?:</function>|>)?")
+        self.tool_call_regex = re.compile(
+            r"<function=([^<>]+)>\{([^}]+)\}(?:</function>|>)?")
 
         if not self.model_tokenizer:
             raise ValueError(
@@ -157,10 +160,12 @@ def extract_tool_calls_streaming(
         Handles format: <function=NAME>{arguments}</function>
         Returns DeltaMessage with either tool_calls or content.
         """
-        logger.debug("\n" + "="*50)
+        logger.debug("\n" , "=" * 50)
         logger.debug("STREAMING FUNCTION CALLED")
-        logger.debug("Tool call start token id IDs:", self.tool_call_start_token_id)
-        logger.debug("Tool call precall token id IDs:", self.tool_call_preargs_token_id)
+        logger.debug(
+            "Tool call start token id IDs:", self.tool_call_start_token_id)
+        logger.debug(
+            "Tool call precall token id IDs:", self.tool_call_preargs_token_id)
         logger.debug("Tool call end token id IDs:", self.tool_call_end_token_id)
         logger.debug("Previous text:", previous_text)
         logger.debug("Current text:", current_text)
@@ -169,16 +174,21 @@ def extract_tool_calls_streaming(
         logger.debug("Current token IDs:", current_token_ids)
         logger.debug("Delta token IDs:", delta_token_ids)
         logger.debug("Current tool name sent:", self.is_current_tool_name_sent)
-        logger.debug("-"*50 + "\n")
+        logger.debug("-"*50)
+        logger.debug("\n")
         flags = Allow.ALL if self.is_current_tool_name_sent \
             else Allow.ALL & ~Allow.STR
 
-        logger.debug(f"{delta_token_ids[0] in self.tool_call_start_token_id=}")
+        logger.debug("%s=", delta_token_ids[0]
+                     in self.tool_call_start_token_id)
         if delta_token_ids[0] in self.tool_call_start_token_id :
             # We possibly have a tool call (not sure yet) we don't stream
 
-            logger.debug(f"{_count_substring(current_text,self.tool_call_start_token)=}")
-            if _count_substring(current_text,self.tool_call_start_token) > self.nb_tool_calls \
+            logger.debug(
+                "%s=", _count_substring(current_text,self.tool_call_start_token)
+            )
+            if _count_substring(
+                current_text,self.tool_call_start_token) > self.nb_tool_calls \
                 and not self.is_parsing_toolcall :
 
                self.is_parsing_toolcall=True
@@ -194,7 +204,7 @@ def extract_tool_calls_streaming(
             # We are parsing a tool call, we need to parse the tool name
             if delta_token_ids != self.tool_call_preargs_token_id:
                 self.current_tool_name += delta_text
-                logger.debug(f"{self.current_tool_name=}")
+                logger.debug("self.current_tool_name=",self.current_tool_name)
                 return None # moving on to the next iteration
             else :
                 self.current_tool_name = self.current_tool_name.lstrip('=')
@@ -210,12 +220,15 @@ def extract_tool_calls_streaming(
         if self.is_current_tool_name_sent :
             logger.debug("Parsed tool name : ", self.current_tool_name)
 
-            if _count_substring(current_text,self.tool_call_end_token) < self.nb_tool_calls:
+            if _count_substring(
+
current_text,self.tool_call_end_token) < self.nb_tool_calls: self.streamed_args_for_tool.append(delta_text) return None # moving on to the next iteration else : - arguments = '{"'+''.join(self.streamed_args_for_tool) # adding back {" at the beginning for valid JSON - arguments = arguments.rstrip(self.tool_call_end_token) # removing the end token + # adding back {" at the beginning for valid JSON + arguments = '{"'+''.join(self.streamed_args_for_tool) + # removing the end token + arguments = arguments.rstrip(self.tool_call_end_token) logger.debug("Concatenated tool call arguments : ", arguments) current_tool_args = partial_json_parser.loads( @@ -238,7 +251,8 @@ def extract_tool_calls_streaming( return delta else : - logger.debug("No tool call detected, returning just text : ", delta_text) + logger.debug( + "No tool call detected, returning just text : ", delta_text) return DeltaMessage(content=delta_text) def reset_state(self): From 1eb96301abaf5ae2dbf753196286fddba3d83a05 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 5 Feb 2025 00:46:54 +0000 Subject: [PATCH 0004/1240] [V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579) Signed-off-by: Mark McLoughlin Signed-off-by: Louis Ulmer --- tests/entrypoints/openai/test_metrics.py | 1 + vllm/v1/engine/__init__.py | 21 +++++++++++++++++++-- vllm/v1/engine/detokenizer.py | 9 +++++---- vllm/v1/engine/output_processor.py | 22 ++++++++++++---------- vllm/v1/metrics/loggers.py | 15 ++++++++++++++- vllm/v1/metrics/stats.py | 10 +++++++--- vllm/v1/request.py | 15 ++++++++------- 7 files changed, 66 insertions(+), 27 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index a9134be6232..de2333901cc 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -205,6 +205,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:gpu_cache_usage_perc", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", + "vllm:request_success_total", "vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_count", diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 912b92862c9..6bd548bdcd8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -15,6 +15,23 @@ from vllm.sampling_params import SamplingParams +class RequestFinishedReason(enum.IntEnum): + """ + Reason a request finished - stop, length, or abort. + + stop - a stop string was emitted + length - max_tokens was consumed, or max_model_len was reached + abort - aborted for another reason + + """ + STOP = 0 + LENGTH = 1 + ABORT = 2 + + def __str__(self): + return self.name.lower() + + @dataclass class EngineCoreRequest: @@ -45,7 +62,7 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[RequestFinishedReason] = None stop_reason: Union[int, str, None] = None @@ -56,7 +73,7 @@ class EngineCoreOutputs( gc=False): # type: ignore[call-arg] #NOTE(Nick): We could consider ways to make this more compact, - # e.g. columnwise layout and using an int enum for finish/stop reason + # e.g. 
columnwise layout # [num_reqs] outputs: List[EngineCoreOutput] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6d800f026b2..2bce23e68d2 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,7 +8,8 @@ from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest, + RequestFinishedReason) logger = init_logger(__name__) @@ -18,7 +19,7 @@ class DetokenizerOutput: output_text: str token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[RequestFinishedReason] = None stop_reason: Union[int, str, None] = None @@ -147,13 +148,13 @@ def update_from_output( stop_str, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant + finish_reason = RequestFinishedReason.STOP stop_reason = stop_str # TODO: handle stop_token_ids here too? # 3) Update the RequestOutput object with the new text. - finished = bool(finish_reason) + finished = finish_reason is not None if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: return None diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index aeefd52399d..94736669147 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -161,8 +161,10 @@ def process_outputs( engine_core_output) # 3) Create and handle RequestOutput objects. - if request_output := self._make_request_output( - req_state, detokenizer_output): + if detokenizer_output is not None: + request_output = self._make_request_output( + req_state, detokenizer_output) + if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put_nowait(request_output) @@ -172,6 +174,8 @@ def process_outputs( # Free completed requests. 
if request_output.finished: + assert detokenizer_output.finish_reason is not None + self.request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer @@ -180,7 +184,8 @@ def process_outputs( # Track per-request stats iteration_stats.update_from_finished_request( - request_output, req_state.stats) + detokenizer_output.finish_reason, request_output, + req_state.stats) return OutputProcessorOutput( request_outputs=request_outputs, @@ -191,12 +196,8 @@ def process_outputs( @staticmethod def _make_request_output( request_state: RequestState, - detokenizer_output: Optional[DetokenizerOutput], - ) -> Optional[RequestOutput]: - - if detokenizer_output is None: - return None - + detokenizer_output: DetokenizerOutput, + ) -> RequestOutput: request_output = RequestOutput.new( request_state.request_id, request_state.prompt, @@ -207,7 +208,8 @@ def _make_request_output( ) if detokenizer_output.finished: completion_output = request_output.outputs[0] - completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.finish_reason = str( + detokenizer_output.finish_reason) completion_output.stop_reason = detokenizer_output.stop_reason return request_output diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f736e38f192..b62351a8fd6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -2,13 +2,14 @@ import time from abc import ABC, abstractmethod -from typing import List +from typing import Dict, List import numpy as np import prometheus_client from vllm.config import ModelConfig from vllm.logger import init_logger +from vllm.v1.engine import RequestFinishedReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -116,6 +117,17 @@ def __init__(self, model_config: ModelConfig): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) + self.counter_request_success: Dict[RequestFinishedReason, + prometheus_client.Counter] = {} + counter_request_success_base = prometheus_client.Counter( + name="vllm:request_success_total", + documentation="Count of successfully processed requests.", + labelnames=labelnames + ["finished_reason"]) + for reason in RequestFinishedReason: + self.counter_request_success[ + reason] = counter_request_success_base.labels(*(labelvalues + + [str(reason)])) + self.histogram_num_prompt_tokens_request = \ prometheus_client.Histogram( name="vllm:request_prompt_tokens", @@ -163,6 +175,7 @@ def log(self, scheduler_stats: SchedulerStats, iteration_stats.num_generation_tokens) for finished_request in iteration_stats.finished_requests: + self.counter_request_success[finished_request.finish_reason].inc() self.histogram_num_prompt_tokens_request.observe( finished_request.num_prompt_tokens) self.histogram_num_generation_tokens_request.observe( diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 88f2c083530..36c95e07d8a 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from vllm.outputs import RequestOutput - from vllm.v1.engine import EngineCoreOutput + from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason @dataclass @@ -32,6 +32,7 @@ class RequestStateStats: class FinishedRequestStats: """Stats associated with a finished request.""" + finish_reason: "RequestFinishedReason" num_prompt_tokens: int = 0 num_generation_tokens: int = 0 @@ -73,8 +74,11 @@ def update_from_output(self, output: 
"EngineCoreOutput", request_state_stats.num_generation_tokens += num_new_generation_tokens request_state_stats.last_token_time = now - def update_from_finished_request(self, request_output: "RequestOutput", + def update_from_finished_request(self, + finish_reason: "RequestFinishedReason", + request_output: "RequestOutput", request_state_stats: RequestStateStats): self.finished_requests.append( - FinishedRequestStats(len(request_output.prompt_token_ids), + FinishedRequestStats(finish_reason, + len(request_output.prompt_token_ids), request_state_stats.num_generation_tokens)) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 0519d9e7875..eb9bf99b406 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -109,7 +109,7 @@ def num_output_tokens(self) -> int: def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) - def get_finished_reason(self) -> Union[str, None]: + def get_finished_reason(self) -> Union[RequestFinishedReason, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: @@ -149,7 +149,8 @@ def is_finished(status: "RequestStatus") -> bool: return status > RequestStatus.PREEMPTED @staticmethod - def get_finished_reason(status: "RequestStatus") -> Union[str, None]: + def get_finished_reason( + status: "RequestStatus") -> Union[RequestFinishedReason, None]: return _FINISHED_REASON_MAP.get(status) @@ -158,8 +159,8 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]: # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. 
_FINISHED_REASON_MAP = { - RequestStatus.FINISHED_STOPPED: "stop", - RequestStatus.FINISHED_LENGTH_CAPPED: "length", - RequestStatus.FINISHED_ABORTED: "abort", - RequestStatus.FINISHED_IGNORED: "length", + RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP, + RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH, + RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT, + RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH, } From 2f2621acfae5bcdb6d7e88488e5f2e107725feb0 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 4 Feb 2025 21:22:24 -0500 Subject: [PATCH 0005/1240] [Perf] Mem align KV caches for CUDA devices (MLA perf improvement) (#12676) Signed-off-by: simon-mo Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Signed-off-by: Lucas Wilkinson Co-authored-by: simon-mo Signed-off-by: Louis Ulmer --- csrc/cache.h | 3 + csrc/cache_kernels.cu | 82 +++++- csrc/torch_bindings.cpp | 4 + tests/kernels/test_cache.py | 262 ++++++++++++++++++ vllm/_custom_ops.py | 5 + vllm/attention/backends/triton_mla.py | 5 +- vllm/attention/ops/triton_decode_attention.py | 16 +- vllm/envs.py | 10 + vllm/utils.py | 10 + vllm/worker/cache_engine.py | 66 ++++- 10 files changed, 429 insertions(+), 34 deletions(-) diff --git a/csrc/cache.h b/csrc/cache.h index 55ed30bd8ce..cf4a65c2905 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -15,6 +15,9 @@ void copy_blocks(std::vector const& key_caches, std::vector const& value_caches, const torch::Tensor& block_mapping); +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping); + void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 23a46b6ed8a..0960888d1f7 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -46,7 +46,10 @@ void swap_blocks(torch::Tensor& src, torch::Tensor& dst, char* src_ptr = static_cast(src.data_ptr()); char* dst_ptr = static_cast(dst.data_ptr()); - const int64_t block_size_in_bytes = src.element_size() * src[0].numel(); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + const int64_t block_size_in_bytes = src.element_size() * src.stride(0); const at::cuda::OptionalCUDAGuard device_guard( src_device.is_cuda() ? 
src_device : dst_device); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); @@ -93,6 +96,24 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs, } } +// Kernel for MLA, which works on a single joint kv_cache +// Grid: (num_layers, num_pairs) +template +__global__ void copy_blocks_mla_kernel( + int64_t* cache_ptrs, const int64_t* __restrict__ block_mapping, + const int mem_footprint_per_block) { + const int layer_idx = blockIdx.x; + const int pair_idx = blockIdx.y; + scalar_t* cache = reinterpret_cast(cache_ptrs[layer_idx]); + int64_t src_block = block_mapping[2 * pair_idx]; + int64_t dst_block = block_mapping[2 * pair_idx + 1]; + int64_t src_offset = src_block * mem_footprint_per_block; + int64_t dst_offset = dst_block * mem_footprint_per_block; + for (int i = threadIdx.x; i < mem_footprint_per_block; i += blockDim.x) { + cache[dst_offset + i] = cache[src_offset + i]; + } +} + } // namespace vllm // Note: the key_caches and value_caches vectors are constant but @@ -147,6 +168,42 @@ void copy_blocks(std::vector const& key_caches, })); } +// copy blocks kernel for MLA (assumes a joint KV-cache) +void copy_blocks_mla(std::vector const& kv_caches, + const torch::Tensor& block_mapping) { + int num_layers = kv_caches.size(); + if (num_layers == 0) { + return; + } + torch::Device cache_device = kv_caches[0].device(); + TORCH_CHECK(cache_device.is_cuda(), "kv_cache must be on CUDA"); + + std::vector cache_ptrs(num_layers); + for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) { + cache_ptrs[layer_idx] = + reinterpret_cast(kv_caches[layer_idx].data_ptr()); + } + torch::Tensor cache_ptrs_tensor = + torch::from_blob(cache_ptrs.data(), {num_layers}, torch::kInt64) + .to(cache_device); + + int num_pairs = block_mapping.size(0); + // We use the stride instead of numel in case the cache is padded for memory + // alignment reasons, we assume the blocks data (inclusive of any padding) + // is contiguous in memory + int mem_footprint_per_block = kv_caches[0].stride(0); + dim3 grid(num_layers, num_pairs); + dim3 block(std::min(1024, mem_footprint_per_block)); + const at::cuda::OptionalCUDAGuard device_guard(cache_device); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( + kv_caches[0].scalar_type(), "copy_blocks_mla_kernel", ([&] { + vllm::copy_blocks_mla_kernel<<>>( + cache_ptrs_tensor.data_ptr(), + block_mapping.data_ptr(), mem_footprint_per_block); + })); +} + namespace vllm { template @@ -254,6 +311,7 @@ __global__ void concat_and_cache_mla_kernel( // + pe_dim)] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, // + const int entry_stride, // const int kv_c_stride, // const int k_pe_stride, // const int kv_lora_rank, // @@ -274,9 +332,8 @@ __global__ void concat_and_cache_mla_kernel( int src_stride, int dst_stride, int size, int offset) { for (int i = threadIdx.x; i < size; i += blockDim.x) { const int64_t src_idx = token_idx * src_stride + i; - const int64_t dst_idx = block_idx * block_stride + - block_offset * (kv_lora_rank + pe_dim) + i + - offset; + const int64_t dst_idx = + block_idx * block_stride + block_offset * entry_stride + i + offset; if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { dst[dst_idx] = src[src_idx]; } else { @@ -391,14 +448,14 @@ void reshape_and_cache_flash( // KV_T is the stored data type of kv-cache. // CACHE_T is the data type of key and value tensors. // KV_DTYPE is the real data type of kv-cache. 
-#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ - vllm::concat_and_cache_mla_kernel \ - <<>>( \ - reinterpret_cast(kv_c.data_ptr()), \ - reinterpret_cast(k_pe.data_ptr()), \ - reinterpret_cast(kv_cache.data_ptr()), \ - slot_mapping.data_ptr(), block_stride, kv_c_stride, \ - k_pe_stride, kv_lora_rank, pe_dim, block_size, \ +#define CALL_CONCAT_AND_CACHE_MLA(KV_T, CACHE_T, KV_DTYPE) \ + vllm::concat_and_cache_mla_kernel \ + <<>>( \ + reinterpret_cast(kv_c.data_ptr()), \ + reinterpret_cast(k_pe.data_ptr()), \ + reinterpret_cast(kv_cache.data_ptr()), \ + slot_mapping.data_ptr(), block_stride, entry_stride, \ + kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size, \ reinterpret_cast(scale.data_ptr())); void concat_and_cache_mla( @@ -428,6 +485,7 @@ void concat_and_cache_mla( int kv_c_stride = kv_c.stride(0); int k_pe_stride = k_pe.stride(0); int block_stride = kv_cache.stride(0); + int entry_stride = kv_cache.stride(1); dim3 grid(num_tokens); dim3 block(std::min(kv_lora_rank, 512)); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 186e9c0e81b..c03806f430a 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -450,6 +450,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { "Tensor block_mapping) -> ()"); cache_ops.impl("copy_blocks", torch::kCUDA, ©_blocks); + cache_ops.def( + "copy_blocks_mla(Tensor(a!)[] kv_caches, Tensor block_mapping) -> ()"); + cache_ops.impl("copy_blocks_mla", torch::kCUDA, ©_blocks_mla); + // Reshape the key and value tensors and cache them. cache_ops.def( "reshape_and_cache(Tensor key, Tensor value," diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 6f909b6803d..21c02c5de35 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -9,6 +9,7 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.utils import align_to_256bytes COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -18,6 +19,13 @@ HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] +# Parameters for MLA tests. +KV_LORA_RANKS = [512] +QK_ROPE_HEAD_DIMS = [64] +NUM_TOKENS_MLA = [42] +BLOCK_SIZES_MLA = [16] +NUM_BLOCKS_MLA = [8] + # Arbitrary values for testing # don't make it too large. e.g. 
[1024, 36000] will OOM NUM_BLOCKS = [1024, 10000] @@ -432,3 +440,257 @@ def test_fp8_e4m3_conversion( ops.convert_fp8(converted_cache, cache_fp8) torch.testing.assert_close(cache, converted_cache, atol=0.001, rtol=0.1) + + +def _create_mla_cache( + num_blocks: int, + block_size: int, + entry_size: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, + align_cache: bool, +) -> torch.Tensor: + cache_dtype = torch.uint8 if kv_cache_dtype == "fp8" else dtype + + if align_cache: + alloc_entry_size = align_to_256bytes(entry_size, cache_dtype) + alloc_shape = (num_blocks, block_size, alloc_entry_size) + cache_full = torch.zeros(alloc_shape, dtype=cache_dtype, device=device) + cache = cache_full[..., :entry_size] + else: + cache = torch.zeros(num_blocks, + block_size, + entry_size, + dtype=cache_dtype, + device=device) + return cache + + +def _fill_mla_cache(cache: torch.Tensor, kv_cache_dtype: str): + rand_dtype = torch.float16 if kv_cache_dtype == "fp8" else cache.dtype + + vals = torch.randn(*cache.shape, device=cache.device, dtype=rand_dtype) + if kv_cache_dtype == "fp8": + temp = torch.zeros_like(cache) + ops.convert_fp8(temp, vals, 1.0, kv_dtype=kv_cache_dtype) + vals = temp + cache.copy_(vals) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("num_tokens", NUM_TOKENS_MLA) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False]) +@torch.inference_mode() +def test_concat_and_cache_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + num_tokens: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + total_slots = num_blocks * block_size + slot_mapping_lst = random.sample(range(total_slots), num_tokens) + slot_mapping = torch.tensor(slot_mapping_lst, + dtype=torch.long, + device=device) + + kv_c = torch.randn(num_tokens, kv_lora_rank, dtype=dtype, device=device) + k_pe = torch.randn(num_tokens, + qk_rope_head_dim, + dtype=dtype, + device=device) + entry_size = kv_lora_rank + qk_rope_head_dim + + scale = torch.tensor(0.1, dtype=torch.float32, device=device) + kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + ref_temp = torch.zeros(*kv_cache.shape, dtype=dtype, device=device) + + for i in range(num_tokens): + slot = slot_mapping[i].item() + block_idx = slot // block_size + block_offset = slot % block_size + ref_temp[block_idx, block_offset, :kv_lora_rank] = kv_c[i] + ref_temp[block_idx, block_offset, kv_lora_rank:] = k_pe[i] + + if kv_cache_dtype == "fp8": + ref_kv_cache = torch.empty_like(ref_temp, dtype=kv_cache.dtype) + ops.convert_fp8(ref_kv_cache, + ref_temp, + scale.item(), + kv_dtype=kv_cache_dtype) + else: + ref_kv_cache = ref_temp + + opcheck( + torch.ops._C_cache_ops.concat_and_cache_mla, + (kv_c, k_pe, kv_cache, slot_mapping, kv_cache_dtype, scale), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + + ops.concat_and_cache_mla(kv_c, k_pe, kv_cache, slot_mapping, + kv_cache_dtype, scale) + + if kv_cache_dtype == "fp8": + result_temp = 
torch.empty_like(kv_cache, dtype=torch.float16) + ops.convert_fp8(result_temp, + kv_cache.contiguous(), + scale.item(), + kv_dtype=kv_cache_dtype) + expected_temp = torch.empty_like(ref_kv_cache, dtype=torch.float16) + ops.convert_fp8(expected_temp, + ref_kv_cache, + scale.item(), + kv_dtype=kv_cache_dtype) + torch.testing.assert_close(result_temp, + expected_temp, + atol=0.001, + rtol=0.1) + else: + torch.testing.assert_close(kv_cache, ref_kv_cache) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("num_layers", NUM_LAYERS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False, True]) +@torch.inference_mode() +def test_copy_blocks_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + block_size: int, + num_blocks: int, + num_layers: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + entry_size = kv_lora_rank + qk_rope_head_dim + + kv_caches = [] + for _ in range(num_layers): + kv_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + _fill_mla_cache(kv_cache, kv_cache_dtype=kv_cache_dtype) + kv_caches.append(kv_cache) + + ref_caches = [kv_cache.clone() for kv_cache in kv_caches] + + num_mappings = min(2, num_blocks // 2) + src_blocks = random.sample(range(num_blocks), num_mappings) + remaining = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining, 2 * num_mappings) + block_mapping = [] + for i in range(num_mappings): + src = src_blocks[i] + dst1 = dst_blocks[2 * i] + dst2 = dst_blocks[2 * i + 1] + block_mapping.append((src, dst1)) + block_mapping.append((src, dst2)) + block_mapping_tensor = torch.tensor(block_mapping, + dtype=torch.int64, + device=device).view(-1, 2) + + for src, dst in block_mapping: + for ref_cache in ref_caches: + ref_cache[dst].copy_(ref_cache[src]) + + opcheck( + torch.ops._C_cache_ops.copy_blocks_mla, + (kv_caches, block_mapping_tensor), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + ) + ops.copy_blocks_mla(kv_caches, block_mapping_tensor) + + for kv_cache, ref_cache in zip(kv_caches, ref_caches): + torch.testing.assert_close(kv_cache, ref_cache) + + +@pytest.mark.parametrize("kv_lora_rank", KV_LORA_RANKS) +@pytest.mark.parametrize("qk_rope_head_dim", QK_ROPE_HEAD_DIMS) +@pytest.mark.parametrize("block_size", BLOCK_SIZES_MLA) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS_MLA) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("align_cache", [False, True]) +@torch.inference_mode() +def test_swap_blocks_mla( + kv_lora_rank: int, + qk_rope_head_dim: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, + kv_cache_dtype: str, + align_cache: bool, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + entry_size = kv_lora_rank + qk_rope_head_dim + + src_cache = _create_mla_cache(num_blocks, block_size, 
entry_size, dtype, + kv_cache_dtype, device, align_cache) + dst_cache = _create_mla_cache(num_blocks, block_size, entry_size, dtype, + kv_cache_dtype, device, align_cache) + + _fill_mla_cache(src_cache, kv_cache_dtype) + _fill_mla_cache(dst_cache, kv_cache_dtype) + + src_cache_clone = src_cache.clone() + + num_mappings = min(2, num_blocks // 2) + src_blocks = random.sample(range(num_blocks), num_mappings) + remaining_blocks = list(set(range(num_blocks)) - set(src_blocks)) + dst_blocks = random.sample(remaining_blocks, num_mappings) + block_mapping = list(zip(src_blocks, dst_blocks)) + block_mapping_tensor = torch.tensor(block_mapping, + dtype=torch.int64, + device="cpu").view(-1, 2) + + opcheck( + torch.ops._C_cache_ops.swap_blocks, + (src_cache, dst_cache, block_mapping_tensor), + test_utils=DEFAULT_OPCHECK_TEST_UTILS, + cond=(kv_lora_rank == KV_LORA_RANKS[0] + and qk_rope_head_dim == QK_ROPE_HEAD_DIMS[0]), + ) + + ops.swap_blocks(src_cache, dst_cache, block_mapping_tensor) + + for src, dst in block_mapping: + torch.testing.assert_close( + src_cache_clone[src].cpu(), + dst_cache[dst].cpu(), + msg=f"Block {src} from src should have been swapped to block " + f"{dst} in dst_cache.") diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index bdc9a6a33df..a6823501676 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1037,6 +1037,11 @@ def copy_blocks(key_caches: List[torch.Tensor], torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) +def copy_blocks_mla(kv_caches: List[torch.Tensor], + block_mapping: torch.Tensor) -> None: + torch.ops._C_cache_ops.copy_blocks_mla(kv_caches, block_mapping) + + def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 20d7ef0fa88..9a1984a931b 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -26,7 +26,6 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) -from vllm.attention.ops.paged_attn import PagedAttention from vllm.attention.ops.triton_decode_attention import decode_attention_fwd from vllm.utils import async_tensor_h2d, make_tensor_with_pad @@ -72,14 +71,14 @@ def swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: torch.Tensor, ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) + ops.copy_blocks_mla(kv_caches, src_to_dists) @staticmethod def get_supported_head_sizes() -> List[int]: diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index ec5ec4ce6e6..057fccb5e59 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -204,10 +204,10 @@ def _decode_att_m_fwd( Req_to_tokens.stride(0), q.stride(0), q.stride(1), - k_buffer.stride(-2), - k_buffer.stride(-1), - v_buffer.stride(-2), - v_buffer.stride(-1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., 
PAGE_SIZE, NUM_HEADS, HEAD_DIM) att_out.stride(0), att_out.stride(1), att_out.stride(2), @@ -438,10 +438,10 @@ def _decode_grouped_att_m_fwd( Req_to_tokens.stride(0), q.stride(0), q.stride(1), - k_buffer.stride(-2), - k_buffer.stride(-1), - v_buffer.stride(-2), - v_buffer.stride(-1), + k_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + k_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-3), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) + v_buffer.stride(-2), # Assume (..., PAGE_SIZE, NUM_HEADS, HEAD_DIM) att_out.stride(0), att_out.stride(1), att_out.stride(2), diff --git a/vllm/envs.py b/vllm/envs.py index 5018f6deb7f..2c731eda783 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -82,6 +82,7 @@ VLLM_MLA_DISABLE: bool = False VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True VLLM_MLA_DISABLE_REQUANTIZATION: bool = False + VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False @@ -539,6 +540,15 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) ), + + # When on a Nvidia GPU aligns single entries (within a page) so they are 256 + # byte aligned for better performance, this increases the memory usage of + # the cache. Currently this only affects MLA that results in non-256 + # byte aligned entries. This matches the alignment the CUDA runtime uses + # for all allocations. Currently this primarily affects MLA, for most other + # models the alignment is already naturally aligned to 256 bytes. + "VLLM_CUDA_MEM_ALIGN_KV_CACHE": + lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))), } # end-env-vars-definition diff --git a/vllm/utils.py b/vllm/utils.py index a2b53fcf252..8b926959875 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -563,6 +563,10 @@ def cdiv(a: int, b: int) -> int: return -(a // -b) +def round_up(x: int, y: int) -> int: + return ((x + y - 1) // y) * y + + def _generate_random_fp8( tensor: torch.Tensor, low: float, @@ -794,6 +798,12 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() +def align_to_256bytes(extent: int, dtype: torch.dtype) -> int: + dtype_size = get_dtype_size(dtype) + eles_per_256bytes = 256 // dtype_size + return round_up(extent, eles_per_256bytes) + + # `collections` helpers def is_list_of( value: object, diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 252fe06600d..3960392cf74 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -2,13 +2,17 @@ """CacheEngine class for managing the KV cache.""" from typing import List +import numpy as np import torch +from vllm import envs from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, - get_dtype_size, is_pin_memory_available) + align_to_256bytes, get_dtype_size, + is_pin_memory_available) logger = init_logger(__name__) @@ -38,6 +42,7 @@ def __init__( self.num_attention_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + self.align_cache = self._align_cache(model_config) self.block_size = cache_config.block_size self.num_gpu_blocks = cache_config.num_gpu_blocks 
@@ -75,15 +80,39 @@ def _allocate_kv_cache( num_blocks, self.block_size, self.num_kv_heads, self.head_size) pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] + + # Align entries so they are 256 byte aligned for better performance + # Primarily targets MLA as this typically only ends up having entries + # be 128 byte aligned. + if self.align_cache: + # We assume the cache shape is: + # (TOTAL_PAGES, PAGE_SIZE, entry_shape...) + # NOTE this assumption currently only holds for MLA so we only apply + # this optimization when `use_mla` is true + entry_shape = kv_cache_shape[2:] + entry_size = np.prod(entry_shape) + alloc_entry_size = align_to_256bytes(entry_size, self.dtype) + alloc_shape = (*kv_cache_shape[:2], alloc_entry_size) + else: + alloc_shape = kv_cache_shape + for _ in range(self.num_attention_layers): # null block in CpuGpuBlockAllocator requires at least that # block to be zeroed-out. # We zero-out everything for simplicity. - kv_cache.append( - torch.zeros(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device)) + layer_kv_cache = torch.zeros(alloc_shape, + dtype=self.dtype, + pin_memory=pin_memory, + device=device) + + # If we allocated with padding for alignment reasons truncate the + # shape while preserving the aligned stride + if self.align_cache: + layer_kv_cache = layer_kv_cache[..., :entry_size] + + # view back to (TOTAL_PAGES, PAGE_SIZE, entry_shape...) for cases + # when entry_shape is higher than 1D + kv_cache.append(layer_kv_cache.view(kv_cache_shape)) return kv_cache def swap_in(self, src_to_dst: torch.Tensor) -> None: @@ -99,6 +128,14 @@ def swap_out(self, src_to_dst: torch.Tensor) -> None: def copy(self, src_to_dsts: torch.Tensor) -> None: self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts) + @staticmethod + def _align_cache(model_config: ModelConfig): + # Currently align_cache only applies to MLA models since the other + # cache kernels haven't been updated yet to support non-continguous + # tensors + return model_config.use_mla and current_platform.is_cuda() \ + and envs.VLLM_CUDA_MEM_ALIGN_KV_CACHE + @staticmethod def get_cache_block_size( cache_config: CacheConfig, @@ -110,14 +147,21 @@ def get_cache_block_size( num_attention_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) - key_cache_block = cache_config.block_size * num_heads * head_size - # For MLA there is no value cache, since the latent vector - # is joint keys and values. - value_cache_block = key_cache_block if not model_config.use_mla else 0 - total = num_attention_layers * (key_cache_block + value_cache_block) if cache_config.cache_dtype == "auto": dtype = model_config.dtype else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + + key_cache_entry = num_heads * head_size + if CacheEngine._align_cache(model_config): + key_cache_entry = align_to_256bytes(key_cache_entry, + model_config.dtype) + + # For MLA there is no value cache, since the latent vector + # is joint keys and values. 
+ value_cache_entry = key_cache_entry if not model_config.use_mla else 0 + total = num_attention_layers * cache_config.block_size * \ + (key_cache_entry + value_cache_entry) + dtype_size = get_dtype_size(dtype) return dtype_size * total From a586285588de65b1451dfeaa7bc100fd8053bca9 Mon Sep 17 00:00:00 2001 From: Aviv Keshet Date: Tue, 4 Feb 2025 18:46:26 -0800 Subject: [PATCH 0006/1240] [Core] add and implement `VLLM_LOGITS_PROCESSOR_THREADS` (#12368) Signed-off-by: Aviv Keshet Signed-off-by: Louis Ulmer --- vllm/envs.py | 9 ++++ .../model_executor/layers/logits_processor.py | 46 ++++++++++++++----- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 2c731eda783..bb419dacb1e 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -31,6 +31,7 @@ VLLM_LOGGING_LEVEL: str = "INFO" VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None + VLLM_LOGITS_PROCESSOR_THREADS: Optional[int] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None @@ -282,6 +283,14 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # if set, vllm will call logits processors in a thread pool with this many + # threads. This is useful when using custom logits processors that either + # (a) launch additional CUDA kernels or (b) do significant CPU-bound work + # while not holding the python GIL, or both. + "VLLM_LOGITS_PROCESSOR_THREADS": + lambda: int(os.getenv("VLLM_LOGITS_PROCESSOR_THREADS", "0")) + if "VLLM_LOGITS_PROCESSOR_THREADS" in os.environ else None, + # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index ebf74c67d64..cdc67ca83d4 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """A layer that compute logits from hidden_stats.""" import inspect +from concurrent.futures import ThreadPoolExecutor from typing import Optional import torch @@ -15,6 +16,11 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform +_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None +if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: + _logits_processor_threadpool = ThreadPoolExecutor( + envs.VLLM_LOGITS_PROCESSOR_THREADS) + class LogitsProcessor(nn.Module): """Process logits and apply logits processors from sampling metadata. 
@@ -135,6 +141,7 @@ def _apply_logits_processors( ) -> torch.Tensor: found_logits_processors = False logits_processed = 0 + logits_row_ids_and_logits_row_futures = [] for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params @@ -148,22 +155,39 @@ def _apply_logits_processors( past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor(prompt_tokens_ids, - past_tokens_ids, - logits_row) - else: - logits_row = logits_processor(past_tokens_ids, - logits_row) - - logits[logits_row_idx] = logits_row + if _logits_processor_threadpool is not None: + logits_row_ids_and_logits_row_futures.append( + (logits_row_idx, + _logits_processor_threadpool.submit( + _apply_logits_processors_single_seq, logits_row, + logits_processors, past_tokens_ids, + prompt_tokens_ids))) + else: + logits[logits_row_idx] = \ + _apply_logits_processors_single_seq( + logits_row, logits_processors, past_tokens_ids, + prompt_tokens_ids) logits_processed += len(seq_group.sample_indices) + len( seq_group.prompt_logprob_indices) + for logits_row_idx, future in logits_row_ids_and_logits_row_futures: + logits[logits_row_idx] = future.result() + if found_logits_processors: # verifies that no rows in logits were missed unexpectedly assert logits_processed == logits.shape[0] return logits + + +def _apply_logits_processors_single_seq(logits_row, logits_processors, + past_tokens_ids, + prompt_tokens_ids) -> torch.Tensor: + for logits_processor in logits_processors: + parameters = inspect.signature(logits_processor).parameters + if len(parameters) == 3: + logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, + logits_row) + else: + logits_row = logits_processor(past_tokens_ids, logits_row) + return logits_row From 45492fa86adaa98ada8f99f60681eef083225d2d Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Tue, 4 Feb 2025 19:58:22 -0800 Subject: [PATCH 0007/1240] [ROCM][AMD][TRITON] Halving warps number for fw_prefill to reduce spilling (#12713) Signed-off-by: Aleksandr Malyshev Co-authored-by: Aleksandr Malyshev Signed-off-by: Louis Ulmer --- vllm/attention/ops/prefix_prefill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index fbb6757ee30..5fca1639363 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -11,7 +11,7 @@ # Static kernels parameters BASE_BLOCK = 128 if current_platform.has_device_capability(80) else 64 -NUM_WARPS = 8 +NUM_WARPS = 4 if current_platform.is_rocm() else 8 # To check compatibility IS_TURING = current_platform.get_device_capability() == (7, 5) From cde9404c7b7617ab781fcc38efaf6fa77f68e562 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 5 Feb 2025 04:31:12 +0000 Subject: [PATCH 0008/1240] Refactor `Linear` handling in `TransformersModel` (#12727) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Louis Ulmer --- vllm/model_executor/layers/linear.py | 30 ++++----- vllm/model_executor/models/transformers.py | 76 ++++++++++------------ 2 files changed, 48 insertions(+), 58 deletions(-) diff --git a/vllm/model_executor/layers/linear.py 
b/vllm/model_executor/layers/linear.py index 08f1e103e53..da8db08fe71 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -2,7 +2,7 @@ import itertools from abc import abstractmethod -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn.functional as F @@ -47,8 +47,8 @@ def adjust_marlin_shard(param, shard_size, shard_offset): def adjust_bitsandbytes_4bit_shard(param: Parameter, - shard_offsets: Dict[str, Tuple[int, int]], - loaded_shard_id: str) -> Tuple[int, int]: + shard_offsets: dict[str, tuple[int, int]], + loaded_shard_id: str) -> tuple[int, int]: """Adjust the quantization offsets and sizes for BitsAndBytes sharding.""" total, _ = shard_offsets["total"] @@ -90,7 +90,7 @@ class LinearMethodBase(QuantizeMethodBase): @abstractmethod def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): """Create weights for a linear layer. @@ -123,7 +123,7 @@ class UnquantizedLinearMethod(LinearMethodBase): def create_weights(self, layer: torch.nn.Module, input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, + output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): weight = Parameter(torch.empty(sum(output_partition_sizes), @@ -179,7 +179,8 @@ def __init__( self.quant_method = quant_config.get_quant_method(self, prefix=prefix) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward(self, + x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]: raise NotImplementedError @@ -240,9 +241,8 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) - def forward( - self, x: torch.Tensor - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: + def forward(self, + x: torch.Tensor) -> tuple[torch.Tensor, Optional[Parameter]]: bias = self.bias if not self.skip_bias_add else None assert self.quant_method is not None output = self.quant_method.apply(self, x, bias) @@ -288,7 +288,7 @@ def __init__(self, skip_bias_add: bool = False, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - output_sizes: Optional[List[int]] = None, + output_sizes: Optional[list[int]] = None, prefix: str = ""): super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix) @@ -374,7 +374,7 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor): loaded_weight = loaded_weight.reshape(1) param.load_column_parallel_weight(loaded_weight=loaded_weight) - def forward(self, input_): + def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]: bias = self.bias if not self.skip_bias_add else None # Matrix multiply. 
@@ -422,7 +422,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): def __init__(self, input_size: int, - output_sizes: List[int], + output_sizes: list[int], bias: bool = True, gather_output: bool = False, skip_bias_add: bool = False, @@ -500,7 +500,7 @@ def weight_loader(self, current_shard_offset = 0 use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - shard_offsets: List[Tuple[int, int, int]] = [] + shard_offsets: list[tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size @@ -602,7 +602,7 @@ def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter, """ current_shard_offset = 0 - shard_offsets: List[Tuple[int, int, int]] = [] + shard_offsets: list[tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) current_shard_offset += output_size @@ -1124,7 +1124,7 @@ def weight_loader_v2(self, param: BasevLLMParameter, param.load_row_parallel_weight(loaded_weight=loaded_weight) - def forward(self, input_): + def forward(self, input_) -> tuple[torch.Tensor, Optional[Parameter]]: if self.input_is_parallel: input_parallel = input_ else: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 160beaa146e..dfc7143823d 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 + # Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -14,7 +15,7 @@ # limitations under the License. """Wrapper around `transformers` models""" import re -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Optional, Union import torch from torch import nn @@ -71,23 +72,10 @@ def vllm_flash_attention_forward( ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward -# Linear Layer that is compatible with transformers internal forward -# TODO: This is a temporary solution, we should find a better way to integrate -class HFColumnParallelLinear(ColumnParallelLinear): - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return super().forward(input)[0] - - -class HFRowParallelLinear(RowParallelLinear): - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return super().forward(input)[0] - - -def replace_tp_linear_class(orig_module: nn.Linear, - style: str, - quant_config=None): +def replace_linear_class( + linear: nn.Linear, + style: str, + quant_config=None) -> Union[ColumnParallelLinear, RowParallelLinear]: """ In model configurations, we use a neutral type (string) to specify parallel styles, here we use it to translate nn.Linear into vllm-style tp Linear. 
@@ -99,26 +87,28 @@ def replace_tp_linear_class(orig_module: nn.Linear, raise ValueError( f"Unsupported parallel style type {type(style)}, expected str") - input_size = orig_module.in_features - output_size = orig_module.out_features - bias = orig_module.bias is not None + vllm_linear_cls = { + "colwise": ColumnParallelLinear, + "rowwise": RowParallelLinear, + }.get(style) - if style == "colwise": - return HFColumnParallelLinear( - input_size, - output_size, - bias, - ) - elif style == "rowwise": - return HFRowParallelLinear( - input_size, - output_size, - bias, - ) - # We don't consider colwise_rep since it's used in lm_head - else: + if vllm_linear_cls is None: raise ValueError(f"Unsupported parallel style value: {style}") + class HFCompatibleLinear(vllm_linear_cls): + """ + Wrapper class that removes `output_bias` from returned output. + """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return super().forward(input)[0] + + return HFCompatibleLinear( + input_size=linear.in_features, + output_size=linear.out_features, + bias=linear.bias is not None, + ) + class TransformersModel(nn.Module): embedding_padding_modules = ["lm_head"] @@ -192,16 +182,16 @@ def tensor_parallelize(self, module: nn.Module, prefix: str = ""): "support it yet!") for child_name, child_module in module.named_children(): - qual_name = prefix + child_name + qual_name = maybe_prefix(prefix, child_name) for pattern, style in self.config.base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): - new_module = replace_tp_linear_class( - child_module, style, self.quant_config) + new_module = replace_linear_class(child_module, style, + self.quant_config) setattr(module, child_name, new_module) self.log_replacement(qual_name, child_module, new_module) else: - self.tensor_parallelize(child_module, prefix=f"{qual_name}.") + self.tensor_parallelize(child_module, prefix=qual_name) def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings @@ -219,7 +209,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], # argument not used + kv_caches: list[torch.Tensor], # argument not used attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -249,10 +239,10 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: name = f"{self.model.base_model_prefix}.{name}" From 34dc5b29bbc81ce0eb3ed3bbea5015444051ea71 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 5 Feb 2025 12:44:26 +0800 Subject: [PATCH 0009/1240] [VLM] Add MLA with pure RoPE support for deepseek-vl2 models (#12729) Signed-off-by: Louis Ulmer --- vllm/attention/backends/mla/utils.py | 30 ++++++++++++++++++++--- vllm/model_executor/models/deepseek_v2.py | 3 ++- vllm/model_executor/models/deepseek_v3.py | 3 ++- 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index 8e584cca365..cd8c08e5ab4 100644 --- a/vllm/attention/backends/mla/utils.py +++ 
b/vllm/attention/backends/mla/utils.py @@ -26,7 +26,8 @@ apply_fp8_linear_generic, current_platform_fp8_dtype, is_fp8) from vllm.model_executor.layers.quantization.utils.quant_utils import ( scaled_dequantize, scaled_quantize) -from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding +from vllm.model_executor.layers.rotary_embedding import ( + DeepseekScalingRotaryEmbedding, RotaryEmbedding) try: from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -174,6 +175,8 @@ def __init__( self.v_head_dim = v_head_dim self.rotary_emb = rotary_emb + self.use_yarn_rope = isinstance(rotary_emb, + DeepseekScalingRotaryEmbedding) self.q_proj = q_proj self.kv_b_proj = kv_b_proj self.o_proj = o_proj @@ -420,6 +423,24 @@ def _forward_decode( ) -> torch.Tensor: raise NotImplementedError + def apply_pure_rope( + self, + input_positions: torch.Tensor, + q_pe: torch.Tensor, + k_pe: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + seq_len = input_positions.size(0) + ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape + + q_pe, k_pe = self.rotary_emb( + input_positions, + q_pe.reshape(seq_len, -1), + k_pe.reshape(seq_len, -1), + ) + q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) + + return q_pe, k_pe + def forward( self, layer: AttentionLayer, @@ -444,13 +465,14 @@ def forward( # Restore head dim (for rotary embedding) k_pe = k_pe.unsqueeze(1) assert hasattr(attn_metadata, "input_positions") + rope_fn = (self.rotary_emb + if self.use_yarn_rope else self.apply_pure_rope) if is_decode: q_nope = self._q_proj_and_k_up_proj(hidden_states_or_q_c) q_pe = torch.matmul(hidden_states_or_q_c, self.W_QR)\ .view(-1, self.num_heads, self.qk_rope_head_dim) - q_pe, k_pe = \ - self.rotary_emb(attn_metadata.input_positions, q_pe, k_pe) + q_pe, k_pe = rope_fn(attn_metadata.input_positions, q_pe, k_pe) else: assert is_prefill q = self.q_proj(hidden_states_or_q_c)[0]\ @@ -458,7 +480,7 @@ def forward( # TODO(lucas): there must be a nicer way to write this line q[..., self.qk_nope_head_dim:], k_pe = \ - self.rotary_emb( + rope_fn( attn_metadata.input_positions, q[..., self.qk_nope_head_dim:], k_pe) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index f5fede4d822..fdd584f9d6d 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -414,7 +414,8 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py index a4829aa1a57..81f82b182f1 100644 --- a/vllm/model_executor/models/deepseek_v3.py +++ b/vllm/model_executor/models/deepseek_v3.py @@ -422,7 +422,8 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.o_proj") - rope_scaling["rope_type"] = 'deepseek_yarn' + if rope_scaling: + rope_scaling["rope_type"] = 'deepseek_yarn' self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, From 283df29f8b8bf556b4efb3b568f88e335034484b Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Tue, 4 Feb 2025 23:44:48 -0500 Subject: [PATCH 0010/1240] [Misc] Bump the compressed-tensors version (#12736) Signed-off-by: Louis Ulmer --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/requirements-common.txt b/requirements-common.txt index 97e33a6dbd8..cfa02025629 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -34,6 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.9.0 # required for compressed-tensors +compressed-tensors == 0.9.1 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py From 24248d407dc51294a63dd41d5abadf1b903a3b6e Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Wed, 5 Feb 2025 00:32:06 -0500 Subject: [PATCH 0011/1240] [Model][Quant] Fix GLM, Fix fused module mappings for quantization (#12634) Signed-off-by: mgoin Signed-off-by: Kyle Sayers Co-authored-by: mgoin Signed-off-by: Louis Ulmer --- .../layers/quantization/base_config.py | 3 +- .../compressed_tensors/compressed_tensors.py | 37 +++-- .../quantization/compressed_tensors/utils.py | 140 +++++++----------- .../layers/quantization/quark/quark.py | 10 +- .../layers/quantization/quark/utils.py | 17 ++- .../layers/quantization/utils/quant_utils.py | 26 ++-- vllm/model_executor/model_loader/loader.py | 4 + vllm/model_executor/model_loader/utils.py | 22 +++ vllm/model_executor/models/chatglm.py | 26 +++- .../models/glm4_vision_encoder.py | 31 ++-- vllm/model_executor/models/minicpmv.py | 14 +- vllm/model_executor/models/qwen.py | 16 +- 12 files changed, 195 insertions(+), 151 deletions(-) diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 2eefcc4f305..c0d8553c0df 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -2,7 +2,7 @@ import inspect from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Mapping, Optional, Type import torch from torch import nn @@ -59,6 +59,7 @@ def method_has_implemented_embedding( class QuantizationConfig(ABC): """Base class for quantization configs.""" + packed_modules_mapping: Mapping[str, List[str]] = dict() @abstractmethod def get_name(self) -> str: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 1a11b2419cc..0e3258e4afb 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -83,7 +83,9 @@ def get_quant_method( # Check if the layer is skipped for quantization. 
# TODO (@robertgshaw2): support module names - if should_ignore_layer(prefix, ignore=self.ignore): + if should_ignore_layer(prefix, + ignore=self.ignore, + fused_mapping=self.packed_modules_mapping): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) @@ -379,34 +381,29 @@ def get_scheme(self, # Will be empty for models with only sparsity weight_quant = input_quant = None - sparsity_scheme: Optional[SparsityCompressionConfig] = None if self.target_scheme_map: matched_target = find_matched_target( layer_name=layer_name, module=layer, - targets=self.target_scheme_map.keys()) + targets=self.target_scheme_map.keys(), + fused_mapping=self.packed_modules_mapping) scheme_dict = self.target_scheme_map[matched_target] weight_quant = scheme_dict.get("weights") input_quant = scheme_dict.get("input_activations") - if self.sparsity_scheme_map: - is_ignored = False - with suppress(ValueError): - is_ignored = find_matched_target( - layer_name=layer_name, - module=layer, - targets=self.sparsity_ignore_list) - - # if the layer is in the sparsity ignore list, - # we should not apply any sparsity scheme - - if not is_ignored: - matched_target = find_matched_target( - layer_name=layer_name, - module=layer, - targets=self.sparsity_scheme_map.keys()) - sparsity_scheme = self.sparsity_scheme_map.get(matched_target) + # Find the sparsity scheme of the layer + # assume that fused layers inerhit first component's sparsity scheme + sparsity_targets = (self.sparsity_scheme_map.keys() - + set(self.sparsity_ignore_list)) + sparsity_scheme: Optional[SparsityCompressionConfig] = None + with suppress(ValueError): + matched_target = find_matched_target( + layer_name=layer_name, + module=layer, + targets=sparsity_targets, + fused_mapping=self.packed_modules_mapping) + sparsity_scheme = self.sparsity_scheme_map[matched_target] if self.supports_cutlass_24(weight_quant=weight_quant, input_quant=input_quant, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 4ea79531efe..85ae1d5cb78 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,14 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Iterable, Optional +from types import MappingProxyType +from typing import Iterable, List, Mapping, Optional from compressed_tensors import CompressionFormat from torch.nn import Module -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) - def is_activation_quantization_format(format: str) -> bool: _ACTIVATION_QUANTIZATION_FORMATS = [ @@ -19,8 +17,11 @@ def is_activation_quantization_format(format: str) -> bool: return format in _ACTIVATION_QUANTIZATION_FORMATS -def should_ignore_layer(layer_name: Optional[str], - ignore: Iterable[str]) -> bool: +def should_ignore_layer( + layer_name: Optional[str], + ignore: Iterable[str] = tuple(), + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: if layer_name is None: return False @@ -32,8 +33,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. 
- if proj_name in FUSED_LAYER_NAME_MAPPING and layer_name not in ignore: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in fused_mapping and layer_name not in ignore: + shard_proj_names = fused_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ @@ -79,55 +80,12 @@ def check_equal_or_regex_match(layer_name: str, return False -def _handle_fused_layers(func): - """ - Decorator to handle fused layers by mapping vllm fused layer names - to their corresponding unfused layer names for quantization/pruning schemes. - """ - # fused_layer_name -> unfused_layer_name - fused_layer_map = { - "qkv_proj": "q_proj", - "gate_up_proj": "up_proj", - } - - def fused_layer_handler(layer_name: Optional[str], module: Module, - targets: Iterable[str]) -> Optional[str]: - """ - Wrapper function specifically designed to support the - find_matched_target function. - - It handles cases where the provided layer name corresponds to a - fused layer in vllm, mapping it to its equivalent unfused layer name - based on the predefined fused_layer_map. If the original layer name - raises a ValueError in the wrapped function, this handler - will attempt to resolve the issue by substituting with unfused - layer name. - - :param layer_name: Name of the layer, which may be fused. - :param module: An instance of torch.nn.Module. - :param targets: A list of target names or patterns to match. - :return: The result of the wrapped find_matched_target function with - the resolved layer name. - :raises ValueError: If the layer name cannot be resolved to a - valid target. - """ - try: - return func(layer_name, module, targets) - except ValueError: - if layer_name is None: - layer_name = "" - parent_name, fused_proj_name = layer_name.rsplit(".", 1) - unfused_proj_name = fused_layer_map.get(fused_proj_name, - fused_proj_name) - new_layer_name = f"{parent_name}.{unfused_proj_name}" - return func(new_layer_name, module, targets) - - return fused_layer_handler - - -@_handle_fused_layers -def find_matched_target(layer_name: Optional[str], module: Module, - targets: Iterable[str]) -> str: +def find_matched_target( + layer_name: Optional[str], + module: Module, + targets: Iterable[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> str: """ Helper function to look up which "target" in the compressed-tensors config that a layer corresponds to. @@ -141,19 +99,25 @@ def find_matched_target(layer_name: Optional[str], module: Module, First, we try to match the layer_name with a target Second, we try to match the module's name with a target + Third, we try to map the layer_name to a list of fused module names. + *All* component module names must match in order for a match to be + successful. A successful match returns the first component target :param layer_name: layer name :param module: torch.nn.Module :param targets: list of targets to match the layer against + :param fused_mapping: map from fused layer names to its components + :param fused_strategy: either "all" or "any". 
If using "all", fused + layers match if "all" of its components match """ if layer_name is None: layer_name = "" - matched_target = (_find_first_match(layer_name, targets) - or _find_first_match(module.__class__.__name__, targets, - True) - or _match_fused_layer(layer_name, targets)) + matched_target = ( + _find_first_match(layer_name, targets) + or _find_first_match(module.__class__.__name__, targets, True) + or _match_fused_layer(layer_name, targets, fused_mapping)) if matched_target is None: raise ValueError( @@ -205,11 +169,19 @@ def _is_equal_or_regex_match(value: str, return False -def _match_fused_layer(layer_name: str, - target_layers: Iterable[str]) -> Optional[str]: +def _match_fused_layer( + layer_name: str, target_layers: Iterable[str], + fused_mapping: Mapping[str, List[str]]) -> Optional[str]: """ Match a fused layer name to its corresponding individual layer in - target_layers. + target_layers. Returns first value in fused_mapping which matches targets + + Implements an "all" matching strategy where a fused layer matches iff + "all" of its components match + + :param layer_name: layer name + :param target_layers: list of targets to match the layer against + :param fused_mapping: map from fused layer names to its components Examples: layer_name = "model.layers.0.self_attn.qkv_proj" @@ -217,27 +189,25 @@ def _match_fused_layer(layer_name: str, "model.layers.0.self_attn.k_proj", "model.layers.0.self_attn.v_proj"] """ - # Split into parent path and layer type - # e.g., "model.layers.0.self_attn" and "qkv_proj" - parent_path = ".".join(layer_name.split(".")[:-1]) - layer_type = layer_name.split(".")[-1] - - if layer_type not in FUSED_LAYER_NAME_MAPPING: + # find layer_name in mapping + fused = next((key for key in fused_mapping if layer_name.endswith(key)), + None) + if fused is None: return None - possible_layer_types = FUSED_LAYER_NAME_MAPPING[layer_type] - - # Look for a target layer that: - # 1. Has the same parent path - # 2. 
Ends with one of the possible individual layer types - for target in target_layers: - is_same_parent = parent_path in target - is_matching_type = any(type_suffix in target - for type_suffix in possible_layer_types) - - if is_same_parent and is_matching_type and all( - (f"{parent_path}.{type_suffix}" in target_layers) - for type_suffix in possible_layer_types): - return target + # expand path of unfused components + unfused_paths = [ + layer_name.replace(fused, unfused) for unfused in fused_mapping[fused] + ] - return None + # for each unfused component, find a match in targets + unfused_matches: List[Optional[str]] = [] + for unfused in unfused_paths: + for target in target_layers: + if _is_equal_or_regex_match(unfused, target): + unfused_matches.append(target) + break + else: + unfused_matches.append(None) + + return unfused_matches[0] if all(unfused_matches) else None diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 0451cf82b99..ba123565a0e 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -18,8 +18,6 @@ QuarkScheme, QuarkW8A8Fp8, QuarkW8A8Int8) from vllm.model_executor.layers.quantization.quark.utils import ( deep_compare, should_ignore_layer) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) from vllm.platforms import current_platform __all__ = ["QuarkLinearMethod"] @@ -58,7 +56,9 @@ def get_quant_method(self, layer: torch.nn.Module, # Check if the layer is skipped for quantization. exclude_layers = cast(List[str], self.quant_config.get("exclude")) - if should_ignore_layer(prefix, ignore=exclude_layers): + if should_ignore_layer(prefix, + ignore=exclude_layers, + fused_mapping=self.packed_modules_mapping): return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) @@ -201,8 +201,8 @@ def _find_matched_config(self, layer_name: str, module: torch.nn.Module) -> Dict[str, Any]: proj_name = layer_name.split(".")[-1] - if proj_name in FUSED_LAYER_NAME_MAPPING: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in self.packed_modules_mapping: + shard_proj_names = self.packed_modules_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index afb1d9d63e7..17e0df02108 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,10 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Any, Iterable, Optional - -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - FUSED_LAYER_NAME_MAPPING) +from types import MappingProxyType +from typing import Any, Iterable, List, Mapping, Optional def deep_compare(dict1: Any, dict2: Any) -> bool: @@ -20,8 +18,11 @@ def deep_compare(dict1: Any, dict2: Any) -> bool: return dict1 == dict2 -def should_ignore_layer(layer_name: Optional[str], - ignore: Iterable[str]) -> bool: +def should_ignore_layer( + layer_name: Optional[str], + ignore: Iterable[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: if layer_name is None: return False @@ -33,8 +34,8 @@ def should_ignore_layer(layer_name: Optional[str], # in the safetensors checkpoint. 
So, we convert the name # from the fused version to unfused + check to make sure that # each shard of the fused layer has the same scheme. - if proj_name in FUSED_LAYER_NAME_MAPPING: - shard_proj_names = FUSED_LAYER_NAME_MAPPING[proj_name] + if proj_name in fused_mapping: + shard_proj_names = fused_mapping[proj_name] # Convert fused_name --> [shard_names] shard_names = [ diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 62484f62f61..c7ce3a42c81 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """This file is used for /tests and /benchmarks""" -from typing import List, Optional, Tuple +from types import MappingProxyType +from typing import List, Mapping, Optional, Tuple import numpy import torch @@ -12,14 +13,6 @@ SUPPORTED_GPTQ_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128] SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] -# Note: this is a hack. We should update each model to register the -# stacked params and get it from there instead in a future PR. -# fused_name: List[shard_name] -FUSED_LAYER_NAME_MAPPING = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] -} - # Normalize the group_shape to the full extent for any dims that are -1 def _normalize_quant_group_shape(x: torch.Tensor, group_shape: Tuple[int, @@ -178,14 +171,23 @@ def unpack_quantized_values_into_int32(w_q: torch.Tensor, return res.permute(inv_perm) -def is_layer_skipped(prefix: str, ignored_layers: List[str]) -> bool: +def is_layer_skipped( + prefix: str, + ignored_layers: List[str], + fused_mapping: Mapping[str, List[str]] = MappingProxyType({}) +) -> bool: # prefix: model.layers.0.self_attn.q_proj # proj_name: q_proj proj_name = prefix.split(".")[-1] - if proj_name in FUSED_LAYER_NAME_MAPPING: + + # Fused layers like gate_up_proj or qkv_proj will not be fused + # in the safetensors checkpoint. So, we convert the name + # from the fused version to unfused + check to make sure that + # each shard of the fused layer has the same scheme. 
+ if proj_name in fused_mapping: shard_prefixes = [ prefix.replace(proj_name, shard_proj_name) - for shard_proj_name in FUSED_LAYER_NAME_MAPPING[proj_name] + for shard_proj_name in fused_mapping[proj_name] ] is_skipped = None diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 19e3bc6a259..2a2c2523b72 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -43,6 +43,7 @@ TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) from vllm.model_executor.model_loader.utils import (ParamMapping, + configure_quant_config, get_model_architecture, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( @@ -113,6 +114,9 @@ def _initialize_model( model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) + if vllm_config.quant_config is not None: + configure_quant_config(vllm_config.quant_config, model_class) + signatures = inspect.signature(model_class.__init__) all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 7a82a695c50..dc620d4984a 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -11,6 +11,8 @@ from vllm.config import ModelConfig, ModelImpl from vllm.logger import init_logger +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models.adapters import (as_classification_model, as_embedding_model, @@ -138,3 +140,23 @@ def get_sub_modules(self, if module_name.endswith(key): return key, value return None + + +def configure_quant_config(quant_config: QuantizationConfig, + model_class: Type[nn.Module]): + """ + Pass packed_modules_mapping by reference to quant_config so that + quant_config can properly match fused modules + + Note that model attributes are passed by reference to quant_config, + enabling them to be updated by model_class.__new__ (ex. 
chatglm, qwen) + """ + packed_mapping = getattr(model_class, "packed_modules_mapping", None) + if packed_mapping is not None: + # pass packed_modules_mapping by reference to quant_config + quant_config.packed_modules_mapping = packed_mapping + else: + logger.warning( + "The model class %s has not defined `packed_modules_mapping`, " + "this may lead to incorrect mapping of quantized or ignored " + "modules", model_class.__name__) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index b81a9e917d4..a3164867525 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -265,12 +265,14 @@ def __init__( self.total_num_kv_heads, bias=config.add_bias_linear or config.add_qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.query_key_value", ) self.dense = RowParallelLinear( self.total_num_heads * self.head_dim, config.hidden_size, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense", ) # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141 @@ -327,6 +329,7 @@ def __init__( self, config: ChatGLMConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() @@ -338,6 +341,7 @@ def __init__( [config.ffn_hidden_size] * 2, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense_h_to_4h", ) self.activation_func = SiluAndMul() @@ -348,6 +352,7 @@ def __init__( config.hidden_size, bias=config.add_bias_linear, quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h", ) def forward(self, hidden_states): @@ -396,7 +401,7 @@ def __init__( config.hidden_size, eps=config.layernorm_epsilon) # MLP - self.mlp = GLMMLP(config, quant_config) + self.mlp = GLMMLP(config, quant_config, prefix=f"{prefix}.mlp") def forward( self, @@ -507,7 +512,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embedding = VocabParallelEmbedding(config.padded_vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.embedding") self.num_layers = config.num_layers self.multi_query_group_num = config.multi_query_group_num @@ -766,6 +772,7 @@ class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsMultiModal): # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. 
+ # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -777,9 +784,18 @@ def __new__( prefix: str = "", ) -> None: config = vllm_config.model_config.hf_config + # Initialize VL - if hasattr(config, "vision_config"): - return ChatGLMV(vllm_config=vllm_config, prefix=prefix) + if hasattr(config, "vision_config"): # noqa: SIM108 + instance_cls = ChatGLMV # Initialize LLM else: - return ChatGLM(vllm_config=vllm_config, prefix=prefix) \ No newline at end of file + instance_cls = ChatGLM + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 4449eb8e8b1..2facd1353ae 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -74,11 +74,13 @@ def __init__( self.head_dim, config.num_heads, quant_config=quant_config, + prefix=f"{prefix}.query_key_value", ) self.dense = RowParallelLinear( config.hidden_size, config.hidden_size, quant_config=quant_config, + prefix=f"{prefix}.dense", ) self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, @@ -101,6 +103,7 @@ def __init__( self, config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): super().__init__() self.config = config @@ -109,11 +112,13 @@ def __init__( config.hidden_size, config.intermediate_size, quant_config=quant_config, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, quant_config=quant_config, + prefix=f"{prefix}.fc2", ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -137,7 +142,9 @@ def __init__( self.attention = Attention(config, quant_config=quant_config, prefix=f"{prefix}.attention") - self.mlp = MLP(config, quant_config=quant_config) + self.mlp = MLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.post_attention_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -164,7 +171,7 @@ def __init__( self.layers = nn.ModuleList([ TransformerLayer(config, quant_config=quant_config, - prefix=f"{prefix}.layer.{layer_idx}") + prefix=f"{prefix}.layers.{layer_idx}") for layer_idx in range(config.num_hidden_layers) ]) @@ -181,6 +188,7 @@ def __init__( config, in_features, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): """ The original implementation is the same as: @@ -222,7 +230,8 @@ def __init__( self.linear_proj = ReplicatedLinear(in_features, config.hidden_size, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") self.norm1 = nn.LayerNorm(config.hidden_size) self.act1 = nn.GELU() self.act2 = SiluAndMul() @@ -230,12 +239,15 @@ def __init__( self.merged_proj = MergedColumnParallelLinear( config.hidden_size, [config.ffn_hidden_size] * 2, bias=False, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.merged_proj") - self.dense_4h_to_h = RowParallelLinear(config.ffn_hidden_size, - config.hidden_size, - bias=False, - quant_config=quant_config) + self.dense_4h_to_h = 
RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h") def forward(self, x): x, _ = self.linear_proj(x) @@ -262,7 +274,8 @@ def __init__( prefix=f"{prefix}.transformer") self.linear_proj = GLU(config, in_features=config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, out_channels=config.hidden_size, kernel_size=2, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 3d16d635b57..20f3a3d1989 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1473,6 +1473,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): """ # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. + # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -1489,8 +1490,15 @@ def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""): version = str(config.version).split(".") version = tuple([int(x) for x in version]) # Dispatch class based on version - instance_class = _SUPPORT_VERSION.get(version) - if instance_class is None: + instance_cls = _SUPPORT_VERSION.get(version) + if instance_cls is None: raise ValueError( "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") - return instance_class(vllm_config=vllm_config, prefix=prefix) + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 327fad0f570..89706612431 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1135,6 +1135,7 @@ class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): """ # Ensure that the LoRA support check passes when the class is not # initialized, but set all these attributes to empty. 
+ # These will be updated when an instance class is selected packed_modules_mapping = {} supported_lora_modules = [] embedding_modules = {} @@ -1146,9 +1147,18 @@ def __new__( prefix: str = "", ) -> QWenBaseModel: config = vllm_config.model_config.hf_config + # Initialize VL - if hasattr(config, "visual"): - return QWenVL(vllm_config=vllm_config, prefix=prefix) + if hasattr(config, "visual"): # noqa: SIM108 + instance_cls = QWenVL # Initialize LLM else: - return QWenLLM(vllm_config=vllm_config, prefix=prefix) + instance_cls = QWenLLM + + # quant_config references base class members, + # so update values before init is called + cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) + cls.supported_lora_modules += instance_cls.supported_lora_modules + cls.embedding_modules.update(instance_cls.embedding_modules) + cls.embedding_padding_modules += instance_cls.embedding_padding_modules + return instance_cls(vllm_config=vllm_config, prefix=prefix) From 7f63211895726491e1361a63fc312c6f2a9984bb Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 5 Feb 2025 01:42:09 -0500 Subject: [PATCH 0012/1240] [Doc] Update PR Reminder with link to Developer Slack (#12748) Signed-off-by: Louis Ulmer --- .github/workflows/reminder_comment.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index df62539c0b3..27318c2fdd9 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -2,7 +2,6 @@ name: PR Reminder Comment Bot on: pull_request_target: types: [opened] - jobs: pr_reminder: runs-on: ubuntu-latest @@ -15,7 +14,12 @@ jobs: owner: context.repo.owner, repo: context.repo.repo, issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' + body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + + '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + + 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. 
If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' + + 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' + + 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' + + '🚀' }) env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From c4022c995055b4f300c6cb7d5a1ad6e7287e574d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 5 Feb 2025 06:42:46 +0000 Subject: [PATCH 0013/1240] [Bugfix] Fix OpenVINO model runner (#12750) Signed-off-by: Louis Ulmer --- vllm/attention/backends/openvino.py | 4 ++++ vllm/model_executor/model_loader/openvino.py | 11 +++++------ vllm/worker/openvino_model_runner.py | 9 +++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index f58528dbf5b..9908620a32a 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -140,3 +140,7 @@ class OpenVINOAttentionMetadata: # `model_executable`. multi_modal_placeholder_index_maps: Optional[Dict[ str, MultiModalPlaceholderMap.IndexMap]] + + # Enable/disable KV scales calculation. This is so that we can disable the + # calculation until after prefill and cuda graph capture. + enable_kv_scales_calculation: bool diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 7bd531c568f..fde200d576e 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -13,7 +13,7 @@ import vllm.envs as envs from vllm.attention.backends.openvino import OpenVINOAttentionMetadata -from vllm.config import DeviceConfig, ModelConfig +from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import (LogitsProcessor, _prune_hidden_states) @@ -103,7 +103,6 @@ def __init__( self, ov_core: ov.Core, model_config: ModelConfig, - device_config: DeviceConfig, kv_cache_dtype: ov.Type, ) -> None: super().__init__() @@ -187,8 +186,7 @@ def sample( def get_model( - model_config: ModelConfig, - device_config: DeviceConfig, + vllm_config: VllmConfig, kv_cache_dtype: ov.Type, **kwargs, ) -> torch.nn.Module: @@ -201,5 +199,6 @@ def get_model( "be added in the future. 
If this is important to you, " "please open an issue on github.") - return OpenVINOCausalLM(ov_core, model_config, device_config, - kv_cache_dtype) + with set_current_vllm_config(vllm_config): + return OpenVINOCausalLM(ov_core, vllm_config.model_config, + kv_cache_dtype) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 44442cddbd4..f7a5ab9de9f 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -54,15 +54,13 @@ def __init__( ): self.ov_core = ov_core ModelRunnerBase.__init__(self, vllm_config=vllm_config) - cache_config = self.cache_config - model_config = self.model_config self.is_driver_worker = is_driver_worker self.device = self.device_config.device self.kv_cache_dtype = kv_cache_dtype - self.sliding_window = model_config.get_sliding_window() - self.block_size = cache_config.block_size + self.sliding_window = self.model_config.get_sliding_window() + self.block_size = self.cache_config.block_size self.attn_backend = get_attn_backend( self.model_config.get_head_size(), @@ -81,8 +79,7 @@ def __init__( self.model: nn.Module # Set after init_Model def load_model(self) -> None: - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, + self.model = get_model(vllm_config=self.vllm_config, kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) From 6443b4f65880873472118293b44ef2044755b122 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Tue, 4 Feb 2025 22:43:02 -0800 Subject: [PATCH 0014/1240] [V1][Misc] Shorten `FinishReason` enum and use constant strings (#12760) Signed-off-by: Louis Ulmer --- vllm/v1/engine/__init__.py | 12 +++++++++--- vllm/v1/engine/detokenizer.py | 7 +++---- vllm/v1/metrics/loggers.py | 6 +++--- vllm/v1/metrics/stats.py | 7 +++---- vllm/v1/request.py | 14 +++++++------- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 6bd548bdcd8..d5933cac50c 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -14,11 +14,17 @@ from vllm.multimodal.inputs import PlaceholderRange from vllm.sampling_params import SamplingParams +# These are possible values of RequestOutput.finish_reason, +# so form part of the external API. +FINISH_REASON_STRINGS = ("stop", "length", "abort") -class RequestFinishedReason(enum.IntEnum): + +class FinishReason(enum.IntEnum): """ Reason a request finished - stop, length, or abort. + Int rather than Str for more compact serialization. 
+ stop - a stop string was emitted length - max_tokens was consumed, or max_model_len was reached abort - aborted for another reason @@ -29,7 +35,7 @@ class RequestFinishedReason(enum.IntEnum): ABORT = 2 def __str__(self): - return self.name.lower() + return FINISH_REASON_STRINGS[self.value] @dataclass @@ -62,7 +68,7 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] finished: bool - finish_reason: Optional[RequestFinishedReason] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 2bce23e68d2..861fcb012c3 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,8 +8,7 @@ from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest, - RequestFinishedReason) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason logger = init_logger(__name__) @@ -19,7 +18,7 @@ class DetokenizerOutput: output_text: str token_ids: List[int] finished: bool - finish_reason: Optional[RequestFinishedReason] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None @@ -148,7 +147,7 @@ def update_from_output( stop_str, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] - finish_reason = RequestFinishedReason.STOP + finish_reason = FinishReason.STOP stop_reason = stop_str # TODO: handle stop_token_ids here too? diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index b62351a8fd6..eb1acf584c6 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,7 +9,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.v1.engine import RequestFinishedReason +from vllm.v1.engine import FinishReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -117,13 +117,13 @@ def __init__(self, model_config: ModelConfig): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) - self.counter_request_success: Dict[RequestFinishedReason, + self.counter_request_success: Dict[FinishReason, prometheus_client.Counter] = {} counter_request_success_base = prometheus_client.Counter( name="vllm:request_success_total", documentation="Count of successfully processed requests.", labelnames=labelnames + ["finished_reason"]) - for reason in RequestFinishedReason: + for reason in FinishReason: self.counter_request_success[ reason] = counter_request_success_base.labels(*(labelvalues + [str(reason)])) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 36c95e07d8a..e3f1efcc9b1 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from vllm.outputs import RequestOutput - from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason + from vllm.v1.engine import EngineCoreOutput, FinishReason @dataclass @@ -32,7 +32,7 @@ class RequestStateStats: class FinishedRequestStats: """Stats associated with a finished request.""" - finish_reason: "RequestFinishedReason" + finish_reason: "FinishReason" num_prompt_tokens: int = 0 num_generation_tokens: int = 0 @@ -74,8 +74,7 @@ def update_from_output(self, output: "EngineCoreOutput", request_state_stats.num_generation_tokens += 
num_new_generation_tokens request_state_stats.last_token_time = now - def update_from_finished_request(self, - finish_reason: "RequestFinishedReason", + def update_from_finished_request(self, finish_reason: "FinishReason", request_output: "RequestOutput", request_state_stats: RequestStateStats): self.finished_requests.append( diff --git a/vllm/v1/request.py b/vllm/v1/request.py index eb9bf99b406..89b39ea615d 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason +from vllm.v1.engine import EngineCoreRequest, FinishReason from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -109,7 +109,7 @@ def num_output_tokens(self) -> int: def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) - def get_finished_reason(self) -> Union[RequestFinishedReason, None]: + def get_finished_reason(self) -> Union[FinishReason, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: @@ -150,7 +150,7 @@ def is_finished(status: "RequestStatus") -> bool: @staticmethod def get_finished_reason( - status: "RequestStatus") -> Union[RequestFinishedReason, None]: + status: "RequestStatus") -> Union[FinishReason, None]: return _FINISHED_REASON_MAP.get(status) @@ -159,8 +159,8 @@ def get_finished_reason( # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { - RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP, - RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH, - RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT, - RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH, + RequestStatus.FINISHED_STOPPED: FinishReason.STOP, + RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH, + RequestStatus.FINISHED_ABORTED: FinishReason.ABORT, + RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH, } From 431f65e18fc8d4fddf3ae9d80fb9f6a1e4d4ce07 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 5 Feb 2025 01:43:11 -0500 Subject: [PATCH 0015/1240] [Doc] Remove performance warning for auto_awq.md (#12743) Signed-off-by: Louis Ulmer --- docs/source/features/quantization/auto_awq.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 30735b1161f..fa0bebeb8ba 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -2,12 +2,6 @@ # AutoAWQ -:::{warning} -Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better -accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency -inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. -::: - To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. The main benefits are lower latency and memory usage. 
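As a rough sketch of the workflow this documentation page describes (the model name, output path, and `quant_config` values below are illustrative choices, not taken from the patch), quantizing a model with AutoAWQ and then loading the INT4 checkpoint in vLLM looks roughly like this:

```python
# Illustrative sketch: quantize an FP16 model to INT4 with AutoAWQ,
# then serve the quantized checkpoint with vLLM's AWQ support.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"  # example source model
quant_path = "mistral-instruct-v0.2-awq"           # example output directory
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}

# Load the full-precision model and its tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize the weights to 4-bit and save the result
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

# The quantized checkpoint can then be loaded by vLLM with quantization="awq"
from vllm import LLM

llm = LLM(model=quant_path, quantization="awq")
print(llm.generate("What are the benefits of 4-bit quantization?"))
```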
From c9451cf2b87018582126a945b4dcdeac672ca717 Mon Sep 17 00:00:00 2001 From: Akash kaothalkar <61960177+Akashcodes732@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:41:02 +0530 Subject: [PATCH 0016/1240] [Bugfix] Fix 'ModuleNotFoundError: No module named 'intel_extension_for_pytorch'' for --tensor-parallel-size more than 1 (#12546) Signed-off-by: Louis Ulmer --- vllm/distributed/parallel_state.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c5c5dfbbab7..321902d11fd 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -329,9 +329,17 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: return input_ if input_.is_cpu: - import intel_extension_for_pytorch as ipex - ipex.distributed.all_reduce(input_, group=self.device_group) - return input_ + try: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) + return input_ + except ImportError: + """ + Intel IPEX not found. Falling back to PyTorch native + all_reduce for CPU + """ + torch.distributed.all_reduce(input_, group=self.device_group) + return input_ if self.tpu_communicator is not None and \ not self.tpu_communicator.disabled: From 643c3a155468bfeb3c7d09ea6b8651185f588003 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 6 Feb 2025 02:03:19 +0800 Subject: [PATCH 0017/1240] [core][distributed] exact ray placement control (#12732) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- .buildkite/test-pipeline.yaml | 2 + examples/offline_inference/ray_placement.py | 121 ++++++++++++++++++++ vllm/envs.py | 14 +++ vllm/executor/ray_distributed_executor.py | 36 +++--- vllm/platforms/cuda.py | 8 ++ vllm/platforms/interface.py | 5 + 6 files changed, 173 insertions(+), 13 deletions(-) create mode 100644 examples/offline_inference/ray_placement.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a847a68a6ef..7ef40564c5b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -128,6 +128,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile - examples/offline_inference/rlhf.py + - examples/offline_inference/ray_placement.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py @@ -136,6 +137,7 @@ steps: # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - python3 ../examples/offline_inference/rlhf.py + - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py - label: Metrics, Tracing Test # 10min num_gpus: 2 diff --git a/examples/offline_inference/ray_placement.py b/examples/offline_inference/ray_placement.py new file mode 100644 index 00000000000..cd801a3c0c8 --- /dev/null +++ b/examples/offline_inference/ray_placement.py @@ -0,0 +1,121 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +a simple demonstration to show how to control +the placement of the vLLM workers with Ray. +The key is to set VLLM_RAY_PER_WORKER_GPUS and +VLLM_RAY_BUNDLE_INDICES properly. 
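+VLLM_RAY_PER_WORKER_GPUS lets each worker claim only a fraction of a GPU,
+so Ray can schedule several actors (for example a training actor and a
+vLLM worker) on the same device, while VLLM_RAY_BUNDLE_INDICES pins each
+engine's workers to specific bundles of the placement group.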
+""" +import os + +import ray +from ray.util.placement_group import placement_group +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +from vllm import LLM +from vllm.worker.worker import Worker + + +class MyWorker(Worker): + + def report_device_id(self) -> str: + from vllm.platforms import current_platform + return current_platform.get_device_uuid(self.device.index) + + +class MyLLM(LLM): + + def __init__(self, *args, bundle_indices: list, **kwargs): + # a hack to make the script work. + # stop ray from manipulating CUDA_VISIBLE_DEVICES + # at the top-level + del os.environ["CUDA_VISIBLE_DEVICES"] + # every worker will use 0.4 GPU, so that we can schedule + # 2 instances on the same GPUs. + os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" + os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join( + map(str, bundle_indices)) + print(f"creating LLM with bundle_indices={bundle_indices}") + super().__init__(*args, **kwargs) + + +class RayTrainingActor: + + def report_device_id(self) -> str: + # the argument for get_device_uuid is the index + # of the GPU in the visible devices. + # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs + from vllm.platforms import current_platform + return current_platform.get_device_uuid(0) + + +# ray manages 4 GPUs +os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" +ray.init() + +# we want to co-locate vLLM instance and the training actor +# on the same set of GPUs. +# the placement plan is as follows: +# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2) +# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2) + +pg = placement_group([{"GPU": 1, "CPU": 0}] * 4) +ray.get(pg.ready()) +print(f"placement group has bundles {pg.bundle_specs=}") + +training_actors = [] +training_actor_device_ids = [] +inference_engines = [] +inference_engine_device_ids = [] + +for bundle_index in [0, 1, 2, 3]: + training_actor = ray.remote( + num_cpus=0, + num_gpus=0.4, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_index, + ), + )(RayTrainingActor).remote() + training_actors.append(training_actor) + device_id = ray.get(training_actor.report_device_id.remote()) + print(f"training actor {bundle_index} is on {device_id}") + training_actor_device_ids.append(device_id) + +for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]): + # IMPORTANT: when creating vLLM instances, we need to + # make sure there are no GPU activities on the target GPUs, + # otherwise, they will interfere with the vLLM memory profiling, + # and cause unexpected behaviors. + llm = ray.remote( + num_cpus=0, + num_gpus=0, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_capture_child_tasks=True, + ), + )(MyLLM).remote( + model="facebook/opt-125m", + enforce_eager=True, + worker_cls=MyWorker, + tensor_parallel_size=2, + distributed_executor_backend="ray", + gpu_memory_utilization=0.4, + bundle_indices=bundle_indices, + ) + inference_engines.append(llm) + # don't call any method on the inference engine here, + # otherwise it will block until the vLLM instance is created. 
+ +for i, llm in enumerate(inference_engines): + inference_engine_device_ids.append( + ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))) + print(f"inference engine {i} is on {inference_engine_device_ids[-1]}") + +# check the placement +# the first two training actors should be +# on the same GPUs as the first inference engine +assert training_actor_device_ids[:2] == inference_engine_device_ids[0] +# the last two training actors should be +# on the same GPUs as the second inference engine +assert training_actor_device_ids[2:] == inference_engine_device_ids[1] diff --git a/vllm/envs.py b/vllm/envs.py index bb419dacb1e..745b068b7a4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -85,6 +85,8 @@ VLLM_MLA_DISABLE_REQUANTIZATION: bool = False VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False + VLLM_RAY_PER_WORKER_GPUS: float = 1.0 + VLLM_RAY_BUNDLE_INDICES: str = "" def get_default_cache_root(): @@ -550,6 +552,18 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0")) ), + # Number of GPUs per worker in Ray, if it is set to be a fraction, + # it allows ray to schedule multiple actors on a single GPU, + # so that users can colocate other actors on the same GPUs as vLLM. + "VLLM_RAY_PER_WORKER_GPUS": + lambda: float(os.getenv("VLLM_RAY_PER_WORKER_GPUS", "1.0")), + + # Bundle indices for Ray, if it is set, it can control precisely + # which indices are used for the Ray bundle, for every worker. + # Format: comma-separated list of integers, e.g. "0,1,2,3" + "VLLM_RAY_BUNDLE_INDICES": + lambda: os.getenv("VLLM_RAY_BUNDLE_INDICES", ""), + # When on a Nvidia GPU aligns single entries (within a page) so they are 256 # byte aligned for better performance, this increases the memory usage of # the cache. Currently this only affects MLA that results in non-256 diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 80e7a1c405f..6a25a4d50fb 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -129,13 +129,7 @@ def _get_env_vars_to_be_updated(self): def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 + num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. @@ -155,12 +149,29 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. - driver_ip = get_ip() - rank = 0 + bundle_indices: List[int] + if envs.VLLM_RAY_BUNDLE_INDICES: + # Use the bundle indices specified by the user. 
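+            # e.g. VLLM_RAY_BUNDLE_INDICES="0,1,2,3" assigns this engine's
+            # workers, in rank order, to placement group bundles 0..3.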
+ bundle_indices = list( + map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(","))) + assert len(bundle_indices) == self.parallel_config.world_size, \ + ("VLLM_RAY_BUNDLE_INDICES must have the same size" + f" as the world size, but got {bundle_indices=} " + f"and {self.parallel_config.world_size=}") + assert len(set(bundle_indices)) == len(bundle_indices), \ + ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values," + f" but got {bundle_indices=}") + else: + # use the first N bundles that have GPU resources. + bundle_indices = [] + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if bundle.get(current_platform.ray_device_key, 0): + bundle_indices.append(bundle_id) + bundle_indices = bundle_indices[:self.parallel_config.world_size] + worker_metadata: List[RayWorkerMetaData] = [] - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get(current_platform.ray_device_key, 0): - continue + driver_ip = get_ip() + for rank, bundle_id in enumerate(bundle_indices): scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, placement_group_capture_child_tasks=True, @@ -187,7 +198,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", rpc_rank=rank) worker_metadata.append( RayWorkerMetaData(worker=worker, created_rank=rank)) - rank += 1 worker_ips = ray.get([ each.worker.get_node_ip.remote() # type: ignore[attr-defined] diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b49852a727f..991d55ac861 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -275,6 +275,14 @@ def get_device_name(cls, device_id: int = 0) -> str: physical_device_id = device_id_to_physical_device_id(device_id) return cls._get_physical_device_name(physical_device_id) + @classmethod + @lru_cache(maxsize=8) + @with_nvml_context + def get_device_uuid(cls, device_id: int = 0) -> str: + physical_device_id = device_id_to_physical_device_id(device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return pynvml.nvmlDeviceGetUUID(handle) + @classmethod @lru_cache(maxsize=8) @with_nvml_context diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index dc6545c933d..211e288b125 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -183,6 +183,11 @@ def get_device_name(cls, device_id: int = 0) -> str: """Get the name of a device.""" raise NotImplementedError + @classmethod + def get_device_uuid(cls, device_id: int = 0) -> str: + """Get the uuid of a device, e.g. 
the PCI bus ID.""" + raise NotImplementedError + @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: """Get the total memory of a device in bytes.""" From af5101697b9ef8ad938d916bac130ef80128c6ba Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 6 Feb 2025 05:24:26 +0800 Subject: [PATCH 0018/1240] Merging PR #12536 Merged via CLI script Signed-off-by: Louis Ulmer --- vllm/attention/layer.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 19ee89630ff..e4df7ffc588 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -156,9 +156,13 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: - if self.calculate_kv_scales and \ - attn_metadata.enable_kv_scales_calculation: - self.calc_kv_scales(key, value) + # NOTE: please avoid accessing `kv_cache` and `attn_metadata` arguments + # directly, use `self.kv_cache` and + # `get_forward_context().attn_metadata` instead. + if self.calculate_kv_scales: + ctx_attn_metadata = get_forward_context().attn_metadata + if ctx_attn_metadata.enable_kv_scales_calculation: + self.calc_kv_scales(key, value) if self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) @@ -172,15 +176,27 @@ def forward( if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) if self.use_direct_call: - unified_attention_with_output(query, key, value, output, - self.layer_name) + forward_context: ForwardContext = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + self.impl.forward(self, + query, + key, + value, + self_kv_cache, + ctx_attn_metadata, + output=output) else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: if self.use_direct_call: - return unified_attention(query, key, value, self.layer_name) + forward_context = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + return self.impl.forward(self, query, key, value, + self_kv_cache, ctx_attn_metadata) else: return torch.ops.vllm.unified_attention( query, key, value, self.layer_name) From 032a267b3aa05e510c18bf5c8dba5ee7bb4bbf46 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Thu, 6 Feb 2025 02:59:45 +0530 Subject: [PATCH 0019/1240] [Hardware][Intel-Gaudi] Enable FusedSDPA support for Intel Gaudi (HPU) Signed-off-by: Louis Ulmer --- vllm/attention/backends/hpu_attn.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 1518e518e91..1ad5e6e8e4e 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -10,7 +10,8 @@ import torch import vllm_hpu_extension.ops as ops -from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache +from vllm_hpu_extension.utils import (Matmul, ModuleFusedSDPA, Softmax, + VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, @@ -137,9 +138,17 @@ def __init__( self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] + self.fused_scaled_dot_product_attention = None if self.prefill_usefusedsdpa: assert alibi_slopes is None, \ 'Prefill with FusedSDPA not supported with alibi slopes!' 
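+            # FusedSDPA ships with habana_frameworks; if the import below
+            # fails, fused_scaled_dot_product_attention stays None and the
+            # native prompt attention path is used instead.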
+ try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA + self.fused_scaled_dot_product_attention = ModuleFusedSDPA( + FusedSDPA) + except ImportError: + logger().warning("Could not import HPU FusedSDPA kernel. " + "vLLM will use native implementation.") suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() if head_size not in suppored_head_sizes: @@ -227,6 +236,7 @@ def forward( matmul_qk_op=self.matmul_qk, softmax_op=self.softmax, matmul_av_op=self.matmul_av, + fsdpa_op=self.fused_scaled_dot_product_attention, ) output = out.reshape(batch_size, seq_len, hidden_size) else: From 9cc4f783c04f375e1c03915c401328381677549c Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Wed, 5 Feb 2025 15:30:43 -0600 Subject: [PATCH 0020/1240] Add: Support for Sparse24Bitmask Compressed Models Signed-off-by: Louis Ulmer --- .../SparseLlama3.1_2of4_fp8_compressed.yaml | 11 + tests/quantization/test_compressed_tensors.py | 332 +++++++++++++++--- .../compressed_tensors/compressed_tensors.py | 34 +- .../schemes/compressed_tensors_24.py | 238 ++++++++++--- 4 files changed, 503 insertions(+), 112 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml diff --git a/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml new file mode 100644 index 00000000000..2928d75ce44 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/SparseLlama3.1_2of4_fp8_compressed.yaml @@ -0,0 +1,11 @@ +# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 +model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.6353 + - name: "exact_match,flexible-extract" + value: 0.637 +limit: null +num_fewshot: null diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 7e2e6f6ed58..0655f2b385f 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -3,6 +3,7 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. 
""" + from typing import Optional import pytest @@ -22,12 +23,30 @@ @pytest.mark.parametrize( "model_args", - [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor", - QuantizationType.INT, 2560, True), - ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel", - QuantizationType.INT, 2560, True), - ("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor", - QuantizationType.INT, 2560, False)]) + [ + ( + "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", + "tensor", + QuantizationType.INT, + 2560, + True, + ), + ( + "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", + "channel", + QuantizationType.INT, + 2560, + True, + ), + ( + "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", + "tensor", + QuantizationType.INT, + 2560, + False, + ), + ], +) def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: @@ -85,21 +104,31 @@ def zp_valid(zp: Optional[torch.Tensor]): assert output -@pytest.mark.parametrize("model_path", [ - "neuralmagic/Llama-3.2-1B-quantized.w8a8", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" -]) +@pytest.mark.parametrize( + "model_path", + [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + ], +) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [10]) -def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner, - example_prompts, model_path, - max_tokens, num_logprobs): +def test_compressed_tensors_w8a8_logprobs( + hf_runner, + vllm_runner, + example_prompts, + model_path, + max_tokens, + num_logprobs, +): dtype = "bfloat16" # skip language translation prompt for the static per tensor asym model - if model_path == "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym": # noqa: E501 + if (model_path == + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" + ): # noqa: E501 example_prompts = example_prompts[0:-1] with hf_runner(model_path, dtype=dtype) as hf_model: @@ -125,13 +154,21 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): assert output -@pytest.mark.parametrize("model_args", [ - ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), - ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"), - ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), - ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", - "channel"), -]) +@pytest.mark.parametrize( + "model_args", + [ + ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), + ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"), + ( + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", + "channel", + ), + ( + "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", + "channel", + ), + ], +) def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: @@ -156,9 +193,12 @@ def check_model(model): @pytest.mark.parametrize( "wNa16_args", - [("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 
8), - ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), - ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)]) + [ + ("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8), + ("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), + ("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4), + ], +) def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: @@ -218,7 +258,8 @@ def check_model(model): CompressedTensorsLinearMethod) assert isinstance( qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8), + ) assert qkv_proj.input_scale.dtype is torch.float32 @@ -241,9 +282,14 @@ def test_compressed_tensors_kv_cache(vllm_runner): assert output -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +def _test_2of4_quant_models(qkv_proj, + weight_strategy, + input_strategy, + format="dense"): assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(qkv_proj.scheme, CompressedTensors24) @@ -252,22 +298,39 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): assert qkv_proj.scheme.quantized assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").format == format assert sparsity_map.get("Linear").sparsity_structure == "2:4" -@pytest.mark.skipif(not current_platform.has_device_capability(90), - reason="Sparse FP8 is not yet supported on this GPU type.") -@pytest.mark.parametrize("args_2of4", [ - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", "channel", - "token"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", - "channel", "tensor"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", "tensor", - "tensor"), - ("nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", - "tensor", "token"), -]) +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing", + "channel", + "token", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing", + "channel", + "tensor", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing", + "tensor", + "tensor", + ), + ( + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing", + "tensor", + "token", + ), + ], +) def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: @@ -286,16 +349,134 @@ def check_model(model): assert output -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -@pytest.mark.parametrize("args_2of4", [ - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", - "channel", "token"), - 
("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", "tensor", - "tensor"), - ("nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", - "tensor", "token"), -]) +@pytest.mark.skipif( + not current_platform.has_device_capability(90), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM", + "channel", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM", + "tensor", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM", + "tensor", + "tensor", + ), + ], +) +def test_compressed_tensors_2of4_quant_fp8_compressed(vllm_runner, args_2of4): + model, weight_strategy, input_strategy = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models( + qkv_proj, + weight_strategy, + input_strategy, + format="sparse-24-bitmask", + ) + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="cutlass is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM", + "channel", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM", + "tensor", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM", + "tensor", + "tensor", + ), + ], +) +def test_compressed_tensors_2of4_quant_int8_compressed(vllm_runner, args_2of4): + model, weight_strategy, input_strategy = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models( + qkv_proj, + weight_strategy, + input_strategy, + format="sparse-24-bitmask", + ) + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Sparse FP8 is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", + [ + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing", + "channel", + "token", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing", + "tensor", + "tensor", + ), + ( + "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing", + "tensor", + "token", + ), + ], +) def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: @@ -317,10 +498,12 @@ def check_model(model): @pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") @pytest.mark.skipif( not sparse_cutlass_supported(), - reason="2of4 Sparse is 
not yet supported on this GPU type.") + reason="2of4 Sparse is not yet supported on this GPU type.", +) @pytest.mark.parametrize( "args_2of4", - [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) + [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")], +) def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: @@ -337,7 +520,9 @@ def check_model(model): assert qkv_proj.scheme.input_quant is None assert not qkv_proj.scheme.quantized assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + sparsity_map = ( + qkv_proj.quant_method.quantization_config.sparsity_scheme_map + ) # noqa: E501 assert sparsity_map.get("Linear").format == "dense" assert sparsity_map.get("Linear").sparsity_structure == "2:4" @@ -346,3 +531,38 @@ def check_model(model): output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) assert output + + +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="Cutlass is not yet supported on this GPU type.", +) +@pytest.mark.parametrize( + "args_2of4", [("nm-testing/llama2.c-stories42M-pruned2.4-compressed")]) +def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4): + model = args_2of4 + with vllm_runner(model) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = ( + qkv_proj.quant_method.quantization_config.sparsity_scheme_map + ) # noqa: E501 + assert sparsity_map.get("Linear").format == "sparse-24-bitmask" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + print(output) + assert output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0e3258e4afb..6ee3e9362f8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -417,15 +417,22 @@ def get_scheme(self, return None # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel - scheme = CompressedTensors24(quantized=weight_quant is not None - or input_quant is not None, - weight_quant=weight_quant, - input_quant=input_quant) + model_compression_config = (None if sparsity_scheme is None + or sparsity_scheme.format == "dense" + else self.config) + + scheme = CompressedTensors24( + quantized=weight_quant is not None or input_quant is not None, + weight_quant=weight_quant, + input_quant=input_quant, + model_compression_config=model_compression_config, + ) elif weight_quant is None: logger.warning_once("Acceleration for non-quantized schemes is " "not supported by Compressed Tensors. 
" "Falling back to UnquantizedLinearMethod") return None + else: # Find the quant_scheme scheme = self._get_scheme_from_parts( # type: ignore @@ -475,10 +482,21 @@ def supports_cutlass_24( :return: True if the layer is supported by the Cutlass 2:4 Kernel False otherwise """ - is_valid_sparsity = (sparsity_scheme is not None - and sparsity_scheme.sparsity_structure - == SparsityStructure.TWO_FOUR.value - and sparsity_scheme.format == "dense") + if sparsity_scheme is None: + return False + + is_valid_sparsity_structure: bool = ( + sparsity_scheme.sparsity_structure == + SparsityStructure.TWO_FOUR.value) + + valid_compressors = { + CompressionFormat.dense.value, + CompressionFormat.sparse_24_bitmask.value + } + + is_valid_sparsity = (is_valid_sparsity_structure + and sparsity_scheme.format in valid_compressors) + if not is_valid_sparsity: return False diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index 84f924b236a..0fb8dfa96a1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,13 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple import torch +from compressed_tensors import CompressionFormat, ModelCompressor from compressed_tensors.quantization import (QuantizationArgs, QuantizationStrategy, QuantizationType) +from compressed_tensors.utils import combine_shards from vllm import _custom_ops as ops +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( @@ -22,26 +26,39 @@ class CompressedTensors24(CompressedTensorsScheme): - def __init__(self, - quantized: bool = False, - weight_quant: Optional[QuantizationArgs] = None, - input_quant: Optional[QuantizationArgs] = None): - + def __init__( + self, + quantized: bool = False, + weight_quant: Optional[QuantizationArgs] = None, + input_quant: Optional[QuantizationArgs] = None, + model_compression_config: Optional[Dict[str, Any]] = None, + ): self.quantized = quantized self.weight_quant = weight_quant self.input_quant = input_quant + self.model_compressor = ( + ModelCompressor.from_compression_config(model_compression_config) + if model_compression_config is not None else None) + self.do_sparse_decompress = ( + self.model_compressor is not None + and self.model_compressor.sparsity_config.format + == CompressionFormat.sparse_24_bitmask.value) @classmethod def get_min_capability(cls) -> int: # Only cutlass 3.x kernels are implemented so far return 90 - def create_weights(self, layer: torch.nn.Module, input_size: int, - output_partition_sizes: List[int], - input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): - + def create_weights( + self, + layer: torch.nn.Module, + input_size: int, + output_partition_sizes: List[int], + input_size_per_partition: int, + params_dtype: torch.dtype, + weight_loader: Callable, + **kwargs, + ): if not sparse_cutlass_supported(): raise ValueError( "Sparse CUTLASS not supported. 
vLLM must be built with " @@ -49,16 +66,56 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, self.output_dtype = params_dtype layer.logical_widths = output_partition_sizes + layer.input_size = input_size + layer.input_size_per_partition = input_size_per_partition self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype) # parameter to store uncompressed weight - weight = ModelWeightParameter(data=torch.empty( - sum(output_partition_sizes), - input_size_per_partition, - dtype=self.weights_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) + weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + if self.do_sparse_decompress: + assert all(partition_size % 8 == 0 + for partition_size in output_partition_sizes + ), "All partitions must be divisible by 8 for " + "2:4 sparse compressed models" + + shape = BasevLLMParameter( + data=torch.empty(2, 1, dtype=torch.int64), + weight_loader=weight_loader, + ) + compressed_weight = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 2, + dtype=self.weights_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + bitmask = ModelWeightParameter( + data=torch.empty( + sum(output_partition_sizes), + input_size_per_partition // 8, + dtype=torch.uint8, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader, + ) + + layer.register_parameter("shape", shape) + layer.register_parameter("compressed", compressed_weight) + layer.register_parameter("bitmask", bitmask) # Check if quantized, not just 2:4 Sparse if self.quantized: @@ -68,14 +125,16 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), output_dim=0, - weight_loader=weight_loader) + weight_loader=weight_loader, + ) else: assert (self.weight_quant and self.weight_quant.strategy == QuantizationStrategy.TENSOR.value) weight_scale = PerTensorScaleParameter( data=torch.empty(len(output_partition_sizes), dtype=torch.float32), - weight_loader=weight_loader) + weight_loader=weight_loader, + ) layer.register_parameter("weight_scale", weight_scale) @@ -84,9 +143,10 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, # register input quant scale assert (self.input_quant.strategy == QuantizationStrategy.TENSOR.value) - input_scale = BasevLLMParameter(data=torch.empty( - 1, dtype=torch.float32), - weight_loader=weight_loader) + input_scale = BasevLLMParameter( + data=torch.empty(1, dtype=torch.float32), + weight_loader=weight_loader, + ) layer.register_parameter("input_scale", input_scale) @@ -107,13 +167,25 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: """ Compress weights after loading. 
Store compressed weight and meta tensor - + :post-condition: layer.w_compressed and layer.meta are set to the compressed weight and meta tensor in the format expected by the Cutlass kernels :param layer: The layer with the weights to be processed - + """ + if self.do_sparse_decompress: + layer.weight.data = self._decompress_bitmask_compressed_weight( + compressed=layer.compressed, + bitmask=layer.bitmask, + layer=layer, + ) + + # compressed and bitmask tensors + # are no longer needed after decompression + del layer.compressed + del layer.bitmask + # torch.compile workaround if hasattr(layer, "input_scale"): layer.input_scale = torch.nn.Parameter(layer.input_scale.data, @@ -121,10 +193,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if self.weight_quant: if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value: - layer.weight_scale = torch.nn.Parameter(convert_to_channelwise( - weight_scale=layer.weight_scale, - logical_widths=layer.logical_widths), - requires_grad=False) + layer.weight_scale = torch.nn.Parameter( + convert_to_channelwise( + weight_scale=layer.weight_scale, + logical_widths=layer.logical_widths, + ), + requires_grad=False, + ) else: # torch.compile workaround layer.weight_scale = torch.nn.Parameter( @@ -134,20 +209,22 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False) layer.meta = torch.nn.Parameter(meta, requires_grad=False) - def apply_weights(self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: Optional[torch.Tensor] = None) -> torch.Tensor: + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: """ - Returns the output tensor for the layer with 2:4 + Returns the output tensor for the layer with 2:4 sparse compressed weights, given the input tensor and bias - :param layer: The layer with 2:4 sparse compressed + :param layer: The layer with 2:4 sparse compressed weights to be used for the computation :param x: The input tensor to the layer :param bias: The bias to be added to the output tensor - :return: The output tensor of the layer + :return: The output tensor of the layer """ if self.quantized: scale = None @@ -171,13 +248,15 @@ def apply_weights(self, input_scale = layer.input_scale q_input = x - out = ops.cutlass_scaled_sparse_mm(a=q_input, - bt_nzs=layer.weight, - bt_meta=layer.meta, - scale_a=input_scale, - scale_b=layer.weight_scale, - out_dtype=self.output_dtype, - bias=bias) + out = ops.cutlass_scaled_sparse_mm( + a=q_input, + bt_nzs=layer.weight, + bt_meta=layer.meta, + scale_a=input_scale, + scale_b=layer.weight_scale, + out_dtype=self.output_dtype, + bias=bias, + ) assert out.is_contiguous() return out @@ -203,8 +282,71 @@ def _get_params_dtype(self, params_dtype: torch.dtype) -> torch.dtype: raise ValueError("Quantization type not supported by Cutlass") + def _decompress_bitmask_compressed_weight( + self, + compressed: torch.Tensor, + bitmask: torch.Tensor, + layer: torch.nn.Module, + ) -> torch.Tensor: + """ + Decompress a compressed 2:4 sparse weight tensor using the bitmask and + return the result. + + This function also supports sharded decompression. + + :param compressed: The 2:4 sparse weight tensor compressed using the + sparse-24-bitmask compressor. This is different from + `cutlass_sparse_compress` which uses a different scheme (2 bits for + every nonzero element that represent the coordinate within the block + of 4). 
The bitmask compression here uses a bitmask to indicate the + positions of non-zero elements. + :param bitmask: The 2:4 bitmask associated with the compressed weights, + representing the positions of non-zero elements in the compressed + tensor. + :param layer: The layer whose weights need to be processed after + loading. + :return: The decompressed 2:4 sparse weight tensor. + """ -def check_24(tensor): - new_tensor = tensor.view(-1, 4) - zero_counts = (new_tensor == 0).sum(dim=1) - return (zero_counts >= 2).all().item() + sparsity_compressor = self.model_compressor.sparsity_compressor + + def _process_split( + bitmask_compressed_weight: torch.Tensor, + shape, + bitmask: torch.Tensor, + ) -> torch.Tensor: + weight_data = dict( + compressed=bitmask_compressed_weight, + shape=shape, + bitmask=bitmask, + ) + return sparsity_compressor.decompress_weight(weight_data) + + split_weights: List[torch.Tensor] = [] + split_bitmask: List[torch.Tensor] = [] + split_shape: List[Tuple[int, int]] = [] + + if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)): + split_weights = torch.split(compressed, layer.logical_widths) + split_bitmask = torch.split(bitmask, layer.logical_widths) + split_shape = [(out, layer.input_size_per_partition) + for out in layer.logical_widths] + + if split_weights: + decompressed_shards = [ + _process_split(compressed_weight, shape, bitmask) + for compressed_weight, shape, bitmask in zip( + split_weights, split_shape, split_bitmask) + ] + decompressed = combine_shards(decompressed_shards) + else: + decompressed = sparsity_compressor.decompress_weight( + dict( + compressed=compressed, + shape=( + layer.logical_widths[0], + layer.input_size_per_partition, + ), + bitmask=bitmask, + )) + return decompressed From 82cfb7b96a0e5baa759fb8c2032200462e6370cc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 6 Feb 2025 05:30:46 +0800 Subject: [PATCH 0021/1240] [VLM] Use shared field to pass token ids to model Signed-off-by: Louis Ulmer --- vllm/model_executor/models/internvl.py | 6 +- vllm/multimodal/inputs.py | 275 +++++++++++++++++++++---- 2 files changed, 235 insertions(+), 46 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 08fc659ab61..380eb40d9eb 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -564,8 +564,7 @@ def _call_hf_processor( # Since there may be extra tokens in the feature placeholders, # we need to pass the image token ID to the model to select the # tokens to merge from the vision encoder outputs - processed_outputs["image_token_id"] = [image_token_id - ] * len(image_data) + processed_outputs["image_token_id"] = torch.tensor(image_token_id) return processed_outputs @@ -575,13 +574,14 @@ def _get_mm_fields_config( hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) + num_images = len(image_num_patches) return dict( pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( "image", image_num_patches), image_num_patches=MultiModalFieldConfig.batched("image"), image_embeds=MultiModalFieldConfig.batched("image"), - image_token_id=MultiModalFieldConfig.batched("image"), + image_token_id=MultiModalFieldConfig.shared("image", num_images), ) def _get_prompt_replacements( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 8e4af7f88f9..2f2535f368c 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -4,6 
+4,7 @@ from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass +from functools import partial from itertools import accumulate from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, Union, cast, final) @@ -164,51 +165,112 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: @dataclass(frozen=True) class MultiModalFieldElem: - """Contains metadata and data of an item in :class:`MultiModalKwargs`.""" - field: "BaseMultiModalField" + """ + Represents a keyword argument corresponding to a multi-modal item + in :class:`MultiModalKwargs`. + """ + + modality: str + """ + The modality of the corresponding multi-modal item. + Each multi-modal item can consist of multiple keyword arguments. + """ + + key: str + """ + The key of this field in :class:`MultiModalKwargs`, + i.e. the name of the keyword argument to be passed to the model. + """ + data: NestedTensors + """ + The tensor data of this field in :class:`MultiModalKwargs`, + i.e. the value of the keyword argument to be passed to the model. + """ + + field: "BaseMultiModalField" + """ + Defines how to combine the tensor data of this field with others + in order to batch multi-modal items together for model inference. + """ def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - return (self.field == other.field - and nested_tensors_equal(self.data, other.data)) + return ((self.modality, self.key) == (other.modality, other.key) + and nested_tensors_equal(self.data, other.data) + and type(self.field) == type(other.field)) # noqa: E721 @dataclass(frozen=True) class BaseMultiModalField(ABC): - """Abstract base class for a field in :class:`MultiModalKwargs`.""" - key: str - modality: str + """ + Defines how to interpret tensor data belonging to a keyword argument in + :class:`MultiModalKwargs` for multiple multi-modal items, and vice versa. + """ + + def _field_factory(self, *, modality: str, key: str): + f = partial( + MultiModalFieldElem, + modality=modality, + key=key, + field=self, + ) + + # Allow passing data as positional argument + def factory(data: NestedTensors) -> MultiModalFieldElem: + return f(data=data) + + return factory @abstractmethod - def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + """ + Construct :class:`MultiModalFieldElem` instances to represent + the provided data. + + This is the inverse of :meth:`reduce_data`. + """ raise NotImplementedError - def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem: - return MultiModalFieldElem(self, data) + @abstractmethod + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + raise NotImplementedError - def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem: - """Merge multiple instances of :class:`MultiModalFieldElem` together.""" - fields = [item.field for item in batch] - if len(set(fields)) > 1: - raise ValueError(f"Cannot merge different {fields=}") + def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors: + """ + Merge the data from multiple instances of :class:`MultiModalFieldElem`. - data = self._reduce_data([item.data for item in batch]) + This is the inverse of :meth:`build_elems`. 
+ """ + field_types = [type(item.field) for item in elems] + if len(set(field_types)) > 1: + raise ValueError(f"Cannot merge different {field_types=}") - return self._build_elem(data) + return self._reduce_data([item.data for item in elems]) @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an element in the batch - is obtained by indexing into the first dimension of the underlying data. + See also: + :func:`MultiModalFieldConfig.batched` """ - def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]: - return [self._build_elem(item) for item in batch] + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(item) for item in data] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): @@ -227,16 +289,20 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an element in the batch - is obtained by slicing along the first dimension of the underlying data. + See also: + :func:`MultiModalFieldConfig.flat` + :func:`MultiModalFieldConfig.flat_from_sizes` """ + slices: Sequence[slice] def build_elems( self, - batch: NestedTensors, - slices: Sequence[slice], - ) -> list[MultiModalFieldElem]: - return [self._build_elem(batch[slice_]) for slice_ in slices] + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data[s]) for s in self.slices] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): @@ -252,25 +318,121 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: return [e for elem in batch for e in elem] +@dataclass(frozen=True) +class MultiModalSharedField(BaseMultiModalField): + """ + See also: + :func:`MultiModalFieldConfig.shared` + """ + batch_size: int + + def build_elems( + self, + modality: str, + key: str, + data: NestedTensors, + ) -> Sequence[MultiModalFieldElem]: + field_factory = self._field_factory(modality=modality, key=key) + return [field_factory(data)] * self.batch_size + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + return batch[0] + + class MultiModalFieldConfig: @staticmethod def batched(modality: str): + """ + Defines a field where an element in the batch is obtained by + indexing into the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + + Example: + + .. code-block:: + + Input: + Data: [[AAAA] + [BBBB] + [CCCC]] + + Output: + Element 1: [AAAA] + Element 2: [BBBB] + Element 3: [CCCC] + """ return MultiModalFieldConfig( - field_cls=MultiModalBatchedField, + field=MultiModalBatchedField(), modality=modality, ) @staticmethod def flat(modality: str, slices: Sequence[slice]): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. 
+ slices: For each multi-modal item, a slice that is used to extract + the data corresponding to it. + + Example: + + .. code-block:: + + Given: + slices: [slice(0, 3), slice(3, 7), slice(7, 9)] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + """ return MultiModalFieldConfig( - field_cls=MultiModalFlatField, + field=MultiModalFlatField(slices=slices), modality=modality, - slices=slices, ) @staticmethod def flat_from_sizes(modality: str, size_per_item: torch.Tensor): + """ + Defines a field where an element in the batch is obtained by + slicing along the first dimension of the underlying data. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + slices: For each multi-modal item, the size of the slice that + is used to extract the data corresponding to it. + + Example: + + .. code-block:: + + Given: + size_per_item: [3, 4, 2] + + Input: + Data: [AAABBBBCC] + + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] + + See also: + :func:`MultiModalFieldConfig.flat` + """ + slice_idxs = [0, *accumulate(size_per_item)] slices = [ slice(slice_idxs[i], slice_idxs[i + 1]) @@ -279,25 +441,52 @@ def flat_from_sizes(modality: str, size_per_item: torch.Tensor): return MultiModalFieldConfig.flat(modality, slices) - def __init__( - self, - field_cls: type[BaseMultiModalField], - modality: str, - **field_config: Any, - ) -> None: + @staticmethod + def shared(modality: str, batch_size: int): + """ + Defines a field where an element in the batch is obtained by + taking the entirety of the underlying data. + + This means that the data is the same for each element in the batch. + + Args: + modality: The modality of the multi-modal item that uses this + keyword argument. + batch_size: The number of multi-modal items which share this data. + + Example: + + .. 
code-block:: + + Given: + batch_size: 4 + + Input: + Data: [XYZ] + + Output: + Element 1: [XYZ] + Element 2: [XYZ] + Element 3: [XYZ] + Element 4: [XYZ] + """ + return MultiModalFieldConfig( + field=MultiModalSharedField(batch_size), + modality=modality, + ) + + def __init__(self, field: BaseMultiModalField, modality: str) -> None: super().__init__() - self.field_cls = field_cls + self.field = field self.modality = modality - self.field_config = field_config def build_elems( self, key: str, batch: NestedTensors, ) -> Sequence[MultiModalFieldElem]: - field = self.field_cls(key=key, modality=self.modality) - return field.build_elems(batch, **self.field_config) # type: ignore + return self.field.build_elems(self.modality, key, batch) class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): @@ -308,11 +497,11 @@ class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): @staticmethod def from_elems(elems: Sequence[MultiModalFieldElem]): - return MultiModalKwargsItem({elem.field.key: elem for elem in elems}) + return MultiModalKwargsItem({elem.key: elem for elem in elems}) @property def modality(self) -> str: - modalities = {elem.field.modality for elem in self.data.values()} + modalities = {elem.modality for elem in self.data.values()} assert len(modalities) == 1, f"Found different modalities={modalities}" return next(iter(modalities)) @@ -372,7 +561,7 @@ def from_items(items: Sequence[MultiModalKwargsItem]): elems_by_key[key].append(elem) data = { - key: elems[0].field.reduce(elems).data + key: elems[0].field.reduce_data(elems) for key, elems in elems_by_key.items() if len(elems) > 0 } From d8ff174fbabc0ce3546bcd1b78d06a6f1ea38dbf Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 5 Feb 2025 16:30:50 -0500 Subject: [PATCH 0022/1240] [Docs] Drop duplicate [source] links Signed-off-by: Louis Ulmer --- docs/source/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index ea3b56e02d1..f4e8c8b9491 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -37,7 +37,6 @@ # ones. 
extensions = [ "sphinx.ext.napoleon", - "sphinx.ext.viewcode", "sphinx.ext.linkcode", "sphinx.ext.intersphinx", "sphinx_copybutton", From 06177c84967d8f83a0e29c9155b12d706bf10900 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 5 Feb 2025 13:31:38 -0800 Subject: [PATCH 0023/1240] [VLM] Qwen2.5-VL Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 11 + examples/offline_inference/vision_language.py | 31 + .../vision_language_multi_image.py | 58 + .../vision_language/test_models.py | 22 + .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/entrypoints/chat_utils.py | 4 +- .../model_executor/layers/rotary_embedding.py | 58 +- vllm/model_executor/models/qwen2_5_vl.py | 1133 +++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 16 +- vllm/model_executor/models/registry.py | 1 + vllm/v1/worker/gpu_model_runner.py | 12 +- vllm/worker/cpu_model_runner.py | 9 +- vllm/worker/model_runner.py | 9 +- 14 files changed, 1315 insertions(+), 52 deletions(-) create mode 100644 vllm/model_executor/models/qwen2_5_vl.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index d8e28429295..3e8b2f89642 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -846,6 +846,13 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ * ✅︎ * ✅︎ +- * `Qwen2_5_VLForConditionalGeneration` + * Qwen2.5-VL + * T + IE+ + VE+ + * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. + * + * ✅︎ + * ✅︎ - * `UltravoxModel` * Ultravox * T + AE+ @@ -880,6 +887,10 @@ The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingf A corrected version is available at . ::: +:::{note} +To use Qwen2.5-VL series models, you have to install Huggingface `transformers` library from source via `pip install git+https://github.com/huggingface/transformers`. +::: + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. 
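The new offline examples below drive Qwen2.5-VL directly through `LLM`. As a
complementary sketch, the same checkpoint can also be queried through vLLM's
OpenAI-compatible server; the snippet assumes a server is already running
locally (for example via `vllm serve Qwen/Qwen2.5-VL-3B-Instruct`) and the
image URL is a placeholder.

```python
# Sketch only: chat completion with an image against a locally served
# Qwen2.5-VL instance. The image URL below is a placeholder.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/demo.jpg"}},
        ],
    }],
    max_tokens=128,
)
print(response.choices[0].message.content)
```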
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 65940b6ada8..436c3657059 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -531,6 +531,36 @@ def run_qwen2_vl(question: str, modality: str): return llm, prompt, stop_token_ids +# Qwen2.5-VL +def run_qwen2_5_vl(question: str, modality: str): + + model_name = "Qwen/Qwen2.5-VL-3B-Instruct" + + llm = LLM( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + "fps": 1, + }, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "aria": run_aria, "blip-2": run_blip2, @@ -557,6 +587,7 @@ def run_qwen2_vl(question: str, modality: str): "pixtral_hf": run_pixtral_hf, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, + "qwen2_5_vl": run_qwen2_5_vl, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 601ac96e16e..8d2172a606f 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -392,6 +392,63 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) +def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: + try: + from qwen_vl_utils import process_vision_info + except ModuleNotFoundError: + print('WARNING: `qwen-vl-utils` not installed, input images will not ' + 'be automatically resized. You can enable this functionality by ' + '`pip install qwen-vl-utils`.') + process_vision_info = None + + model_name = "Qwen/Qwen2.5-VL-3B-Instruct" + + llm = LLM( + model=model_name, + max_model_len=32768 if process_vision_info is None else 4096, + max_num_seqs=5, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = [{"type": "image", "image": url} for url in image_urls] + messages = [{ + "role": "system", + "content": "You are a helpful assistant." 
+ }, { + "role": + "user", + "content": [ + *placeholders, + { + "type": "text", + "text": question + }, + ], + }] + + processor = AutoProcessor.from_pretrained(model_name) + + prompt = processor.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + stop_token_ids = None + + if process_vision_info is None: + image_data = [fetch_image(url) for url in image_urls] + else: + image_data, _ = process_vision_info(messages, + return_video_sample_fps=False) + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=image_data, + chat_template=None, + ) + + model_example_map = { "aria": load_aria, "deepseek_vl_v2": load_deepseek_vl2, @@ -404,6 +461,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: "pixtral_hf": load_pixtral_hf, "qwen_vl_chat": load_qwen_vl_chat, "qwen2_vl": load_qwen2_vl, + "qwen2_5_vl": load_qwen2_5_vl, } diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 85bc4ac1318..95505dcf5c2 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -121,6 +121,8 @@ else ("half", "float")), marks=[pytest.mark.core_model], ), + # TODO(ywang96): Move Qwen2-VL out of core models in favor of Qwen2.5-VL + # once we upgraded to transformers>=4.49.0. "qwen2_vl": VLMTestInfo( models=["Qwen/Qwen2-VL-2B-Instruct"], test_type=( @@ -138,6 +140,26 @@ image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + "qwen2_5_vl": VLMTestInfo( + models=["Qwen/Qwen2.5-VL-3B-Instruct"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.skipif( + TRANSFORMERS_VERSION < "4.49.0", + reason="HF model requires transformers>=4.49.0", + ), pytest.mark.core_model, pytest.mark.cpu_model], + ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 5cd749cbd77..77cf3442df9 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -161,6 +161,7 @@ def _test_processing_correctness( "nvidia/NVLM-D-72B", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", "fixie-ai/ultravox-v0_3", ]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 285fbe48480..20787fe008a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -264,6 +264,8 @@ def check_available_online( trust_remote_code=True), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": 
_HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 + min_transformers_version="4.49"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", trust_remote_code=True), # [Encoder-decoder] diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3a6e75b1d8e..f04902ae1c7 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -410,7 +410,7 @@ def _placeholder_str(self, modality: ModalityStr, return "" if model_type == "mllama": return "<|image|>" - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" @@ -430,7 +430,7 @@ def _placeholder_str(self, modality: ModalityStr, return "()" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": - if model_type == "qwen2_vl": + if model_type in ("qwen2_vl", "qwen2_5_vl"): return "<|vision_start|><|video_pad|><|vision_end|>" if model_type in ("minicpmo", "minicpmv"): return "()" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 814c3b7d9cd..b3b9b0e8760 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,6 +27,7 @@ import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.model_executor.custom_op import CustomOp @@ -772,8 +773,12 @@ def __init__( dtype: torch.dtype, mrope_section: Optional[List[int]] = None, ) -> None: - super().__init__(head_size, rotary_dim, max_position_embeddings, base, - is_neox_style, dtype) + # In Qwen2.5-VL, the maximum index value is related to the duration of + # the input video. We enlarge max_position_embeddings to 4 times to get + # a larger the cos and sin cache. 
+ self.cache_max_position_num = max_position_embeddings * 4 + super().__init__(head_size, rotary_dim, self.cache_max_position_num, + base, is_neox_style, dtype) self.mrope_section = mrope_section if self.mrope_section: @@ -831,13 +836,10 @@ def forward( @staticmethod def get_input_positions( input_tokens: List[int], + hf_config: PretrainedConfig, image_grid_thw: Union[List[List[int]], torch.Tensor], video_grid_thw: Union[List[List[int]], torch.Tensor], - image_token_id: int, - video_token_id: int, - vision_start_token_id: int, - vision_end_token_id: int, - spatial_merge_size: int, + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, ) -> Tuple[List[List[int]], int]: @@ -845,16 +847,13 @@ def get_input_positions( llm_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions_tensor( - input_tokens, - image_grid_thw, - video_grid_thw, - image_token_id, - video_token_id, - vision_start_token_id, - vision_end_token_id, - spatial_merge_size, - context_len, - seq_len, + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + context_len=context_len, + seq_len=seq_len, ) return llm_positions.tolist(), mrope_position_delta @@ -862,18 +861,22 @@ def get_input_positions( @staticmethod def get_input_positions_tensor( input_tokens: List[int], + hf_config: PretrainedConfig, image_grid_thw: Union[List[List[int]], torch.Tensor], video_grid_thw: Union[List[List[int]], torch.Tensor], - image_token_id: int, - video_token_id: int, - vision_start_token_id: int, - vision_end_token_id: int, - spatial_merge_size: int, + second_per_grid_ts: Optional[List[float]] = None, context_len: int = 0, seq_len: Optional[int] = None, ) -> Tuple[torch.Tensor, int]: """Get mrope input positions and delta value.""" + image_token_id = hf_config.image_token_id + video_token_id = hf_config.video_token_id + vision_start_token_id = hf_config.vision_start_token_id + spatial_merge_size = hf_config.vision_config.spatial_merge_size + tokens_per_second = getattr(hf_config.vision_config, + "tokens_per_second", 1.0) + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -892,6 +895,7 @@ def get_input_positions_tensor( image_index, video_index = 0, 0 for _ in range(image_nums + video_nums): + video_second_per_grid_t = 0.0 if image_token_id in input_tokens and remain_images > 0: ed_image = input_tokens.index(image_token_id, st) else: @@ -915,9 +919,13 @@ def get_input_positions_tensor( video_grid_thw[video_index][1], video_grid_thw[video_index][2], ) + video_second_per_grid_t = 1.0 + if second_per_grid_ts is not None: + video_second_per_grid_t = second_per_grid_ts[video_index] video_index += 1 remain_videos -= 1 ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = \ t, h // spatial_merge_size, w // spatial_merge_size text_len = ed - st @@ -927,8 +935,10 @@ def get_input_positions_tensor( llm_pos_ids_list.append( torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) - t_index = torch.arange(llm_grid_t).view(-1, 1).expand( - -1, llm_grid_h * llm_grid_w).flatten() + t_index = (torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w) * video_second_per_grid_t * + tokens_per_second).long().flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( llm_grid_t, -1, llm_grid_w).flatten() w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( diff --git 
a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py new file mode 100644 index 00000000000..e93cf46b900 --- /dev/null +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -0,0 +1,1133 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from functools import cached_property, partial +from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from transformers import BatchFeature +from transformers.models.qwen2_5_vl import (Qwen2_5_VLImageProcessor, + Qwen2_5_VLProcessor) +from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import ( + Qwen2_5_VLConfig, Qwen2_5_VLVisionConfig) + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalFieldConfig +from vllm.platforms import _Backend +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.config import uses_mrope + +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .qwen2_vl import Qwen2VLDummyInputsBuilder as Qwen2_5_VLDummyInputsBuilder +from .qwen2_vl import (Qwen2VLMultiModalProcessor, Qwen2VLProcessingInfo, + apply_rotary_pos_emb_vision) +from .utils import (AutoWeightsLoader, WeightsMapper, + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + +# 
=== Vision Inputs === # + + +class Qwen2_5_VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +class Qwen2_5_VLImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2_5_VLImageInputs = Union[Qwen2_5_VLImagePixelInputs, + Qwen2_5_VLImageEmbeddingInputs] + + +class Qwen2_5_VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_videos: torch.Tensor + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + second_per_grid_ts: torch.Tensor + """ + The video time interval (in seconds) for each grid along the temporal + dimension in the 3D position IDs. Returned when `videos` is not `None`. + """ + + +class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + video_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all videos' features. + Each tensor holds an video's features. + - `torch.Tensor`: A tensor holding all videos' features + (concatenation of all videos' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the videos. + - `hidden_size` must match the hidden size of language model backbone. + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. 
+ """ + + +Qwen2_5_VLVideoInputs = Union[Qwen2_5_VLVideoPixelInputs, + Qwen2_5_VLVideoEmbeddingInputs] + +# === Vision Encoder === # + + +class Qwen2_5_VisionMLP(nn.Module): + + def __init__(self, + in_features: int, + hidden_features: int, + bias: bool = False, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + self.gate_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_proj") + self.up_proj = ColumnParallelLinear(in_features, + hidden_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.up_proj") + self.down_proj = RowParallelLinear(hidden_features, + in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_fn = act_fn + + def forward(self, x: torch.Tensor): + x_gate, _ = self.gate_proj(x) + x_gate = self.act_fn(x_gate) + x_up, _ = self.up_proj(x) + x_down, _ = self.down_proj(x_gate * x_up) + return x_down + + +class Qwen2_5_VisionAttention(nn.Module): + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + # Per attention head and per partition values. + world_size = parallel_state.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, world_size) + + self.qkv = ColumnParallelLinear(input_size=embed_dim, + output_size=3 * projection_size, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj") + + # Detect attention implementation. + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS + }: + raise RuntimeError( + f"Qwen2.5-VL does not support {self.attn_backend} backend now." + ) + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + x = x.view(*new_x_shape) + + # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] + q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + + if self.attn_backend == _Backend.FLASH_ATTN: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) + from flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) + + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0, + causal=False) + + context_layer = rearrange(output, + "(b s) ... 
-> b s ...", + b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + seq_length = q.size(1) + q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) + attention_mask = torch.zeros([1, seq_length, seq_length], + device=q.device, + dtype=torch.bool) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], + cu_seqlens[i - 1]:cu_seqlens[i]] = True + output = F.scaled_dot_product_attention(q, + k, + v, + attention_mask, + dropout_p=0.0) + context_layer = rearrange(output, "b h s d -> b s h d ") + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Qwen2RMSNorm(nn.Module): + + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class Qwen2_5_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_hidden_dim: int, + act_fn: Callable[[torch.Tensor], torch.Tensor] = F.silu, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + self.attn = Qwen2_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + self.mlp = Qwen2_5_VisionMLP(dim, + mlp_hidden_dim, + act_fn=act_fn, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb) + x = x + self.mlp(self.norm2(x)) + return x + + +class Qwen2_5_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + temporal_patch_size: int = 2, + in_channels: int = 3, + hidden_size: int = 1152, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.hidden_size = hidden_size + + kernel_size = (temporal_patch_size, patch_size, patch_size) + self.proj = nn.Conv3d(in_channels, + hidden_size, + kernel_size=kernel_size, + stride=kernel_size, + bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + L, C = x.shape + x = x.view(L, -1, self.temporal_patch_size, self.patch_size, + self.patch_size) + x = self.proj(x).view(L, self.hidden_size) + return x + + +class Qwen2_5_VisionPatchMerger(nn.Module): + + def __init__( + self, + d_model: int, + context_dim: int, + norm_layer: 
Optional[Callable[[int], nn.Module]] = None, + spatial_merge_size: int = 2, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.ln_q = norm_layer(context_dim) + self.mlp = nn.ModuleList([ + ColumnParallelLinear(self.hidden_size, + self.hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.0"), + nn.GELU(), + RowParallelLinear(self.hidden_size, + d_model, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.mlp.2"), + ]) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.ln_q(x) + x = x.view(-1, self.hidden_size) + + mlp_fc1, mlp_act, mlp_fc2 = self.mlp + x_parallel, _ = mlp_fc1(x) + x_parallel = mlp_act(x_parallel) + out, _ = mlp_fc2(x_parallel) + return out + + +class Qwen2_5_VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + inv_freq = 1.0 / (theta + **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._freqs_cached = None + + def update_freqs_cache(self, seqlen: int) -> None: + if seqlen > self._seq_len_cached: + seqlen *= 2 + self._seq_len_cached = seqlen + self.inv_freq = 1.0 / (self.theta**(torch.arange( + 0, self.dim, 2, dtype=torch.float, device=self.inv_freq.device) + / self.dim)) + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(seq, self.inv_freq) + self._freqs_cached = freqs + + def forward(self, seqlen: int) -> torch.Tensor: + self.update_freqs_cache(seqlen) + return self._freqs_cached[:seqlen] + + +class Qwen2_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config: Qwen2_5_VLVisionConfig, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + + patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + in_channels = vision_config.in_channels + depth = vision_config.depth + self.hidden_size = vision_config.hidden_size + self.num_heads = vision_config.num_heads + + # args for get_window_index + self.window_size = vision_config.window_size + self.patch_size = vision_config.patch_size + self.spatial_merge_size = vision_config.spatial_merge_size + self.fullatt_block_indexes = vision_config.fullatt_block_indexes + self.spatial_merge_unit = self.spatial_merge_size**2 + + self.patch_embed = Qwen2_5_VisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=in_channels, + hidden_size=self.hidden_size, + ) + + # NOTE: We use torch native RMSNorm here for precision purposes. 
+ norm_layer = partial(Qwen2RMSNorm, eps=norm_eps) + head_dim = self.hidden_size // self.num_heads + self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Qwen2_5_VisionBlock( + dim=self.hidden_size, + num_heads=self.num_heads, + mlp_hidden_dim=vision_config.intermediate_size, + act_fn=_ACTIVATION_REGISTRY[vision_config.hidden_act], + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) + ]) + self.merger = Qwen2_5_VisionPatchMerger( + d_model=vision_config.out_hidden_size, + context_dim=self.hidden_size, + norm_layer=norm_layer, + spatial_merge_size=self.spatial_merge_size, + quant_config=quant_config, + prefix=f"{prefix}.merger", + ) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def get_window_index(self, grid_thw): + window_index: list = [] + cu_window_seqlens: list = [0] + window_index_id = 0 + vit_merger_window_size = (self.window_size // + self.spatial_merge_size // self.patch_size) + + for grid_t, grid_h, grid_w in grid_thw: + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + window_index.append(index_new + window_index_id) + cu_seqlens_tmp = seqlens.cumsum( + 0) * self.spatial_merge_unit + cu_window_seqlens[-1] + cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) + window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() + window_index = torch.cat(window_index, dim=0) + return window_index, cu_window_seqlens + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + # patchify + hidden_states = 
x.to(device=self.device, dtype=self.dtype) + hidden_states = self.patch_embed(hidden_states) + + # compute position embedding + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + # windows attention + window_index, cu_window_seqlens = self.get_window_index(grid_thw) + cu_window_seqlens = torch.tensor( + cu_window_seqlens, + device=hidden_states.device, + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + rotary_pos_emb = rotary_pos_emb[window_index, :, :] + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + # compute cu_seqlens + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) + + # transformers + hidden_states = hidden_states.unsqueeze(1) + for layer_num, blk in enumerate(self.blocks): + if layer_num in self.fullatt_block_indexes: + cu_seqlens_now = cu_seqlens + else: + cu_seqlens_now = cu_window_seqlens + hidden_states = blk(hidden_states, + cu_seqlens=cu_seqlens_now, + rotary_pos_emb=rotary_pos_emb) + + # adapter + hidden_states = self.merger(hidden_states) + reverse_indices = torch.argsort(window_index) + hidden_states = hidden_states[reverse_indices, :] + return hidden_states + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith("qkv.weight"): + visual_num_heads = self.num_heads + visual_embed_dim = self.hidden_size + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size, + visual_embed_dim) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) + elif name.endswith("qkv.bias"): + visual_num_heads = self.num_heads + visual_embed_dim = self.hidden_size + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1) + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen2_5_VLProcessingInfo(Qwen2VLProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen2_5_VLConfig) + + def get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + fps: Optional[float] = 2.0, + ) -> Qwen2_5_VLProcessor: + hf_processor = 
self.ctx.get_hf_processor(Qwen2_5_VLProcessor)
+        image_processor = hf_processor.image_processor  # type: ignore
+        assert isinstance(image_processor, Qwen2_5_VLImageProcessor)
+
+        if min_pixels:
+            image_processor.min_pixels = min_pixels
+        if max_pixels:
+            image_processor.max_pixels = max_pixels
+        if max_pixels or min_pixels:
+            image_processor.size = {
+                "min_pixels": image_processor.min_pixels,
+                "max_pixels": image_processor.max_pixels,
+            }
+
+        return hf_processor
+
+    def get_image_processor(
+        self,
+        *,
+        min_pixels: Optional[int] = None,
+        max_pixels: Optional[int] = None,
+        fps: Optional[float] = 2.0,
+    ) -> Qwen2_5_VLImageProcessor:
+        hf_processor = self.get_hf_processor(
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+            fps=fps,
+        )
+        image_processor = hf_processor.image_processor  # type: ignore
+        assert isinstance(image_processor, Qwen2_5_VLImageProcessor)
+        return image_processor
+
+
+class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor):
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs),
+            second_per_grid_ts=MultiModalFieldConfig.batched("video"),
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    Qwen2_5_VLMultiModalProcessor,
+    info=Qwen2_5_VLProcessingInfo,
+    dummy_inputs=Qwen2_5_VLDummyInputsBuilder)
+class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                         SupportsLoRA, SupportsPP):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ]
+    }
+
+    # LoRA specific attributes, TODO: double check
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+        "gate_proj",
+        "up_proj",
+        # vision tower
+        "qkv",
+        "attn.proj",  # Distinguish patch_embed.proj
+        "fc1",
+        "fc2",
+        # projector
+        "mlp.0",
+        "mlp.2"
+    ]
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    # To ensure correct weight loading and mapping.
+    hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={
+        "lm_head.": "language_model.lm_head.",
+        "model.": "language_model.model.",
+    })
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config: Qwen2_5_VLConfig = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+        multimodal_config = vllm_config.model_config.multimodal_config
+
+        self.config = config
+        self.multimodal_config = multimodal_config
+
+        self.visual = Qwen2_5_VisionTransformer(
+            config.vision_config,
+            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+            quant_config=self._maybe_ignore_quant_config(quant_config),
+            prefix=maybe_prefix(prefix, "visual"),
+        )
+
+        self.language_model = init_vllm_registered_model(
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "language_model"),
+            architectures=["Qwen2ForCausalLM"],
+        )
+
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
+    @cached_property
+    def sampler(self):
+        if hasattr(self.language_model, "sampler"):
+            return self.language_model.sampler
+
+        return get_sampler()
+
+    def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig):
+        # GPTQ configs do not have a list of ignored modules, however AutoGPTQ
+        # seems to avoid vision encoder sections for some models.
+        if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)):
+            return None
+        return quant_config
+
+    def _validate_and_reshape_mm_tensor(self, mm_input: object,
+                                        name: str) -> torch.Tensor:
+        if not isinstance(mm_input, (torch.Tensor, list)):
+            raise ValueError(f"Incorrect type of {name}. "
+                             f"Got type: {type(mm_input)}")
+        if isinstance(mm_input, torch.Tensor):
+            if mm_input.ndim == 2:
+                return mm_input
+            if mm_input.ndim != 3:
+                raise ValueError(f"{name} should be 2D or batched 3D tensor. "
+                                 f"Got ndim: {mm_input.ndim} "
+                                 f"(shape={mm_input.shape})")
+            return torch.concat(list(mm_input))
+        else:
+            return torch.concat(mm_input)
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[Qwen2_5_VLImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_embeds = kwargs.pop("image_embeds", None)
+        image_grid_thw = kwargs.pop("image_grid_thw", None)
+
+        if pixel_values is None and image_embeds is None:
+            return None
+
+        if pixel_values is not None:
+            pixel_values = self._validate_and_reshape_mm_tensor(
+                pixel_values, "image pixel values")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, "image grid_thw")
+
+            if not isinstance(pixel_values, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of image pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            return Qwen2_5_VLImagePixelInputs(type="pixel_values",
+                                              pixel_values=pixel_values,
+                                              image_grid_thw=image_grid_thw)
+
+        if image_embeds is not None:
+            image_embeds = self._validate_and_reshape_mm_tensor(
+                image_embeds, "image embeds")
+            image_grid_thw = self._validate_and_reshape_mm_tensor(
+                image_grid_thw, "image grid_thw")
+
+            if not isinstance(image_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of image embeddings. "
+                                 f"Got type: {type(image_embeds)}")
+            return Qwen2_5_VLImageEmbeddingInputs(
+                type="image_embeds",
+                image_embeds=image_embeds,
+                image_grid_thw=image_grid_thw)
+
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[Qwen2_5_VLVideoInputs]:
+        pixel_values_videos = kwargs.pop("pixel_values_videos", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+        video_grid_thw = kwargs.pop("video_grid_thw", None)
+        second_per_grid_ts = kwargs.pop("second_per_grid_ts", None)
+
+        if pixel_values_videos is None and video_embeds is None:
+            return None
+
+        if pixel_values_videos is not None:
+            pixel_values_videos = self._validate_and_reshape_mm_tensor(
+                pixel_values_videos, "video pixel values")
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, "video grid_thw")
+
+            return Qwen2_5_VLVideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values_videos=pixel_values_videos,
+                video_grid_thw=video_grid_thw,
+                second_per_grid_ts=second_per_grid_ts,
+            )
+
+        if video_embeds is not None:
+            video_embeds = self._validate_and_reshape_mm_tensor(
+                video_embeds, "video embeds")
+            video_grid_thw = self._validate_and_reshape_mm_tensor(
+                video_grid_thw, "video grid_thw")
+
+            if not isinstance(video_embeds, torch.Tensor):
+                raise ValueError("Incorrect type of video embeddings. "
+                                 f"Got type: {type(video_embeds)}")
+            return Qwen2_5_VLVideoEmbeddingInputs(
+                type="video_embeds",
+                video_embeds=video_embeds,
+                video_grid_thw=video_grid_thw)
+
+    def _process_image_input(
+            self,
+            image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = image_input["image_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if image_input["type"] == "image_embeds":
+            image_embeds = image_input["image_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values = image_input["pixel_values"].type(self.visual.dtype)
+            image_embeds = self.visual(pixel_values, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each image item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+
+        return image_embeds.split(sizes.tolist())
+
+    def _process_video_input(
+            self,
+            video_input: Qwen2_5_VLVideoInputs) -> tuple[torch.Tensor, ...]:
+
+        grid_thw = video_input["video_grid_thw"]
+        assert grid_thw.ndim == 2
+
+        if video_input["type"] == "video_embeds":
+            video_embeds = video_input["video_embeds"].type(self.visual.dtype)
+        else:
+            pixel_values_videos = video_input["pixel_values_videos"].type(
+                self.visual.dtype)
+            video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw)
+
+        # Split concatenated embeddings for each video item.
+        merge_size = self.visual.spatial_merge_size
+        sizes = grid_thw.prod(-1) // merge_size // merge_size
+
+        return video_embeds.split(sizes.tolist())
+
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        modalities = {}
+
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_videos",
+                             "video_embeds") and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+        return modalities
+
+    def get_multimodal_embeddings(
+            self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]:
+
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
+            return None
+
+        # The result multimodal_embeddings is a tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+ for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[tuple[torch.Tensor, ...]] = None, + video_input: Optional[tuple[torch.Tensor, ...]] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + """Run forward pass for Qwen2.5-VL. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen2.5-VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,). + pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in LLM. + `None` if no images are passed. + pixel_values_videos: Pixel values of videos to be fed to a model. + `None` if no videos are passed. + video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. + `None` if no videos are passed. + second_per_grid_ts: Tensor `(num_videos)` of video time interval ( + in seconds) for each grid along the temporal dimension in the + 3D position IDs. `None` if no videos are passed. + """ + + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. 
+ elif inputs_embeds is None: + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="visual.", + tower_model="visual.merger.") diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 2b2638cf68f..34ae7b8c946 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -650,8 +650,8 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], - dict[str, torch.Tensor]]): +class Qwen2VLEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): def __init__(self, data: dict, modality: str) -> None: super().__init__(data, modality) @@ -683,26 +683,26 @@ def get_passthrough_data(self) -> Mapping[str, object]: return self.data -class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems): +class Qwen2VLImageEmbeddingItems(Qwen2VLEmbeddingItems): def __init__(self, data: dict) -> None: super().__init__(data, "image") -class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems): +class Qwen2VLVideoEmbeddingItems(Qwen2VLEmbeddingItems): def __init__(self, data: dict) -> None: super().__init__(data, "video") -class Qwen2MultiModalDataParser(MultiModalDataParser): +class Qwen2VLMultiModalDataParser(MultiModalDataParser): def _parse_image_data( self, data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], ) -> ModalityDataItems[Any, Any]: if isinstance(data, dict): - return Qwen2EmbeddingItems(data, modality="image") + return Qwen2VLEmbeddingItems(data, modality="image") return super()._parse_image_data(data) @@ -711,7 +711,7 @@ def _parse_video_data( data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], ) -> ModalityDataItems[Any, Any]: if isinstance(data, dict): - return Qwen2EmbeddingItems(data, modality="video") + return Qwen2VLEmbeddingItems(data, modality="video") return super()._parse_video_data(data) @@ -948,7 +948,7 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] ): def 
_get_data_parser(self) -> MultiModalDataParser: - return Qwen2MultiModalDataParser() + return Qwen2VLMultiModalDataParser() def _get_prompt_replacements( self, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 962f95f10fc..b6708f77d8a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -172,6 +172,7 @@ "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 + "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), # [Encoder-decoder] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 7841fac1df3..ec6d04cd497 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -285,6 +285,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: if self.model_config.uses_mrope: image_grid_thw = [] video_grid_thw = [] + second_per_grid_ts = [] for mm_input in self.requests[req_id].mm_inputs: if mm_input.get("image_grid_thw") is not None: image_grid_thw.extend( @@ -292,6 +293,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: if mm_input.get("video_grid_thw") is not None: video_grid_thw.extend( mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) hf_config = self.model_config.hf_config @@ -299,14 +303,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: self.requests[req_id].mrope_position_delta = \ MRotaryEmbedding.get_input_positions_tensor( self.requests[req_id].prompt_token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. - spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, ) req_ids_to_add.append(req_id) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 1c3feece95a..9400893105d 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -386,20 +386,17 @@ def _compute_multi_modal_input(self, "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) hf_config = self.runner.model_config.hf_config token_ids = seq_data.get_token_ids() mrope_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. 
- spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=computed_len, ) seq_data.mrope_position_delta = mrope_position_delta diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0bbba55b3b3..12baecde6e4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -702,6 +702,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, "mrope embedding type requires multi-modal input mapper " "returns 'image_grid_thw' or 'video_grid_thw'.") + second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None) hf_config = self.runner.model_config.hf_config inter_data.mrope_input_positions = [None] * inter_data.n_seqs @@ -713,14 +714,10 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mrope_input_positions, mrope_position_delta = \ MRotaryEmbedding.get_input_positions( token_ids, + hf_config=hf_config, image_grid_thw=image_grid_thw, video_grid_thw=video_grid_thw, - image_token_id=hf_config.image_token_id, - video_token_id=hf_config.video_token_id, - vision_start_token_id=hf_config.vision_start_token_id, - vision_end_token_id=hf_config.vision_end_token_id, - spatial_merge_size=hf_config.vision_config. - spatial_merge_size, + second_per_grid_ts=second_per_grid_ts, context_len=inter_data.context_lens[seq_idx], seq_len=inter_data.seq_lens[seq_idx], ) From b4c634b17106be64d3d269416269898235059081 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 6 Feb 2025 11:09:45 +0800 Subject: [PATCH 0024/1240] [VLM] Update compatibility with transformers 4.49 Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 3 +- examples/template_pixtral_hf.jinja | 38 ------------------- tests/entrypoints/test_chat_utils.py | 1 - .../vision_language/test_models.py | 4 +- .../vision_language/test_llava_next.py | 7 ++-- vllm/model_executor/models/llava.py | 33 +++++++++++----- vllm/model_executor/models/llava_next.py | 10 ++++- vllm/model_executor/models/minicpmv.py | 9 +++++ vllm/multimodal/inputs.py | 4 +- 9 files changed, 50 insertions(+), 59 deletions(-) delete mode 100644 examples/template_pixtral_hf.jinja diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3e8b2f89642..ef7e77fa3ec 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -883,8 +883,7 @@ For more details, please see: ::: :::{note} -The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). -A corrected version is available at . +`mistral-community/pixtral-12b` does not support V1 yet. 
::: :::{note} diff --git a/examples/template_pixtral_hf.jinja b/examples/template_pixtral_hf.jinja deleted file mode 100644 index e94661cb390..00000000000 --- a/examples/template_pixtral_hf.jinja +++ /dev/null @@ -1,38 +0,0 @@ -{%- if messages[0]["role"] == "system" %} - {%- set system_message = messages[0]["content"] %} - {%- set loop_messages = messages[1:] %} -{%- else %} - {%- set loop_messages = messages %} -{%- endif %} - -{{- bos_token }} -{%- for message in loop_messages %} - {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %} - {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }} - {%- endif %} - {%- if message["role"] == "user" %} - {%- if loop.last and system_message is defined %} - {{- "[INST]" + system_message + "\n" }} - {%- else %} - {{- "[INST]" }} - {%- endif %} - {%- if message["content"] is not string %} - {%- for chunk in message["content"] %} - {%- if chunk["type"] == "text" %} - {{- chunk["text"] }} - {%- elif chunk["type"] == "image" %} - {{- "[IMG]" }} - {%- else %} - {{- raise_exception("Unrecognized content type!") }} - {%- endif %} - {%- endfor %} - {%- else %} - {{- message["content"] }} - {%- endif %} - {{- "[/INST]" }} - {%- elif message["role"] == "assistant" %} - {{- message["content"] + eos_token}} - {%- else %} - {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} - {%- endif %} -{%- endfor %} diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 737f733092b..5c469007af2 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -761,7 +761,6 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_llava.jinja", "string"), - ("template_pixtral_hf.jinja", "openai"), ("template_vlm2vec.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 95505dcf5c2..b00ec6fa699 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -224,7 +224,7 @@ marks=[ pytest.mark.skipif( Version(TRANSFORMERS_VERSION) >= Version("4.48"), - reason="HF model is not compatible with transformers>=4.48.0", + reason="HF model is not compatible with transformers>=4.48", ) ], ), @@ -359,7 +359,7 @@ marks=[ pytest.mark.skipif( Version(TRANSFORMERS_VERSION) >= Version("4.48"), - reason="HF model is not compatible with transformers>=4.48.0", + reason="HF model is not compatible with transformers>=4.48", ) ], ), diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 6ba3c540389..990c6c150fc 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -4,7 +4,6 @@ import pytest import torch.nn.functional as F -import transformers from transformers import AutoModelForVision2Seq from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner @@ -57,6 +56,10 @@ def _run_test( with hf_runner(model, dtype=dtype, auto_cls=AutoModelForVision2Seq) as hf_model: + # Patch the issue where generation_config.json is missing + 
hf_model.processor.patch_size = \ + hf_model.model.config.vision_config.patch_size + # Patch the issue where image_token_id # exceeds the maximum allowed vocab size hf_model.model.resize_token_embeddings( @@ -88,8 +91,6 @@ def _run_test( ) -@pytest.mark.skipif(transformers.__version__ >= "4.46", - reason="Model broken with changes in transformers 4.46") @pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 63d308ef6d1..b1fee3eeb54 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -293,16 +293,29 @@ def _call_hf_processor( pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: - images = mm_data["images"] - assert isinstance(images, list) - - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) and len(pixel_values) == 1) - assert (isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) - - processed_outputs["pixel_values"] = pixel_values[0] + # Before/after https://github.com/huggingface/transformers/pull/35122 + if Version(TRANSFORMERS_VERSION) <= Version("4.48.2"): + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) + + processed_outputs["pixel_values"] = pixel_values[0] + else: + # Avoid padding since we need the output for each image to be + # independent of other images for the cache to work correctly + image_sizes = processed_outputs["image_sizes"] + assert len(pixel_values) == len(image_sizes) + + processed_outputs["pixel_values"] = [ + p[:, :h, :w] + for p, (h, w) in zip(pixel_values, image_sizes) + ] return processed_outputs diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index defdeb54afb..719916642f2 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -73,7 +73,15 @@ def get_hf_config(self) -> LlavaNextLikeConfig: return self.ctx.get_hf_config(LlavaNextConfig) def get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaNextProcessor) + hf_processor = self.ctx.get_hf_processor(LlavaNextProcessor) + + # In case patch_size is omitted from `processor_config.json` + # e.g. 
for E5-V: https://huggingface.co/royokong/e5-v + if hf_processor.patch_size is None: + patch_size = self.get_vision_encoder_info().get_patch_size() + hf_processor.patch_size = patch_size + + return hf_processor # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L113 def get_num_image_tokens( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 20f3a3d1989..58a4448d436 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -342,6 +342,15 @@ def get_hf_processor( **kwargs: object, ): hf_processor = self.ctx.get_hf_processor() + + # NumPy arrays are considered as Iterable but not Sequence in + # https://github.com/huggingface/transformers/blob/main/src/transformers/image_transforms.py#L428 + image_processor = hf_processor.image_processor # type: ignore + for attr in ("mean", "std"): + val = getattr(image_processor, attr) + if isinstance(val, np.ndarray): + setattr(image_processor, attr, val.tolist()) + return hf_processor def get_image_processor(self): diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2f2535f368c..5f9593ee8b2 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -141,9 +141,9 @@ class PlaceholderRange(TypedDict): def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: """Equality check between :data:`NestedTensors` objects.""" if isinstance(a, torch.Tensor): - return isinstance(b, torch.Tensor) and bool((a == b).all().item()) + return isinstance(b, torch.Tensor) and torch.equal(a, b) elif isinstance(b, torch.Tensor): - return isinstance(a, torch.Tensor) and bool((b == a).all().item()) + return isinstance(a, torch.Tensor) and torch.equal(b, a) if isinstance(a, list): return (isinstance(b, list) From 8361a3e5d1b9853f48cd17e70efd90cecd4a60a5 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Wed, 5 Feb 2025 22:15:08 -0500 Subject: [PATCH 0025/1240] [ROCm][Kernel] Using the correct warp_size value Signed-off-by: Louis Ulmer --- csrc/moe/moe_align_sum_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index ff74a42d7e8..01dac404465 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -207,8 +207,8 @@ __global__ void sgl_moe_align_block_size_kernel( __shared__ int32_t shared_counts[32][8]; __shared__ int32_t local_offsets[256]; - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; const int experts_per_warp = 8; const int my_expert_start = warp_id * experts_per_warp; From 108718fd8831dc2dc258829c5c54e30976c840f3 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 5 Feb 2025 22:22:19 -0500 Subject: [PATCH 0026/1240] [Bugfix] Better FP8 supported defaults Signed-off-by: Louis Ulmer --- .../layers/quantization/utils/fp8_utils.py | 28 +++++++++++-------- .../layers/quantization/utils/w8a8_utils.py | 6 +++- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 10ff71e5757..99fbda314f6 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py 
@@ -15,7 +15,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( _normalize_quant_group_shape, scaled_dequantize) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear) + CUTLASS_BLOCK_FP8_SUPPORTED, CUTLASS_FP8_SUPPORTED, apply_fp8_linear) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -38,7 +38,7 @@ def apply_w8a8_block_fp8_linear( weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, - cutlass_block_fp8_supported: bool = True, + cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, ) -> torch.Tensor: assert input_scale is None # View input as 2D matrix for fp8 methods @@ -85,12 +85,14 @@ def apply_w8a8_block_fp8_linear( # `apply_fp8_linear` # NOTE(lucas): this is quite messy, we should think through this more formally def apply_fp8_linear_generic( - input: torch.Tensor, - weight: torch.Tensor, - weight_scale: torch.Tensor, - input_group_shape: Tuple[int, int], - weight_group_shape: Tuple[int, int], - input_scale: Optional[torch.Tensor] = None, # static scale if one + input: torch.Tensor, + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_group_shape: Tuple[int, int], + weight_group_shape: Tuple[int, int], + input_scale: Optional[torch.Tensor] = None, # static scale if one + cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, + cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED, ) -> torch.Tensor: # View input as 2D matrix for fp8 methods input = input.view(-1, input.shape[-1]) @@ -105,14 +107,18 @@ def is_dim_blocked(dim, shape, group_shape): if is_dim_blocked(0, weight.shape, weight_group_shape[0])\ and is_dim_blocked(1, weight.shape, weight_group_shape[1]) and\ input_group_shape == (1, weight_group_shape[1]): - return apply_w8a8_block_fp8_linear(input, weight, - list(weight_group_shape), - weight_scale) + return apply_w8a8_block_fp8_linear( + input, + weight, + list(weight_group_shape), + weight_scale, + cutlass_block_fp8_supported=cutlass_block_fp8_supported) else: # Despite having linear in the it doesn't conform to # `torch.nn.functional.linear` which is defined as `input @ weight.T` # so we explicitly transpose the weight matrix here return apply_fp8_linear(input, weight.T, weight_scale.T, + cutlass_fp8_supported=cutlass_fp8_supported, use_per_token_if_dynamic=\ (input_group_shape == (1, input.shape[1]))) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 3fd88e8754a..dedeb0c296b 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -42,6 +42,10 @@ def cutlass_block_fp8_supported() -> bool: return ops.cutlass_scaled_mm_supports_block_fp8(capability) +CUTLASS_FP8_SUPPORTED = cutlass_fp8_supported() +CUTLASS_BLOCK_FP8_SUPPORTED = cutlass_block_fp8_supported() + + def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: @@ -109,7 +113,7 @@ def apply_fp8_linear( input_scale: Optional[torch.Tensor] = None, input_scale_ub: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, - cutlass_fp8_supported: bool = True, + cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED, use_per_token_if_dynamic: bool = False, ) -> torch.Tensor: # ops.scaled_fp8_quant supports both dynamic and static quant. 
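The change above replaces the hard-coded `cutlass_fp8_supported: bool = True` / `cutlass_block_fp8_supported: bool = True` defaults with `CUTLASS_FP8_SUPPORTED` and `CUTLASS_BLOCK_FP8_SUPPORTED`, module-level constants evaluated once at import time, so callers that omit the flag inherit a platform-accurate value instead of an optimistic one. A minimal sketch of that "probe once, reuse as default" pattern follows; `_probe_cutlass_fp8()` and its compute-capability threshold are illustrative assumptions, not vLLM's actual capability check.

```python
# Sketch of the "probe once, use as the default" pattern from the patch above.
# _probe_cutlass_fp8() is a stand-in for vLLM's cutlass_fp8_supported();
# its body (a compute-capability >= 8.9 check) is assumed for illustration.
import torch


def _probe_cutlass_fp8() -> bool:
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return (major, minor) >= (8, 9)


# Evaluated exactly once at import time, like CUTLASS_FP8_SUPPORTED above.
CUTLASS_FP8_SUPPORTED: bool = _probe_cutlass_fp8()


def apply_fp8_linear_sketch(
    x: torch.Tensor,
    weight: torch.Tensor,
    cutlass_fp8_supported: bool = CUTLASS_FP8_SUPPORTED,
) -> torch.Tensor:
    # Callers that omit the flag now inherit the probed default rather
    # than a hard-coded True.
    if cutlass_fp8_supported:
        # Placeholder for the CUTLASS scaled-mm path.
        return x @ weight.t()
    # Placeholder for the non-CUTLASS fallback path.
    return torch.nn.functional.linear(x, weight)
```

Probing at import time keeps repeated device queries off the hot path while still letting callers override the flag explicitly, which is the trade-off this bugfix codifies.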
From d1739653772a4373e71c8b537d467bf1568520f2 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Wed, 5 Feb 2025 19:23:35 -0800 Subject: [PATCH 0027/1240] [Misc][Easy] Remove the space from the file name Signed-off-by: Louis Ulmer --- ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 vllm/model_executor/layers/fused_moe/fused_moe.py | 2 +- ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...IA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ..._name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 vllm/model_executor/layers/quantization/utils/fp8_utils.py | 2 +- 42 files changed, 2 insertions(+), 2 deletions(-) rename vllm/model_executor/layers/fused_moe/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/fused_moe/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => 
E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename 
vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => 
N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py 
index 1bed35525e9..f14200e0288 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -765,7 +765,7 @@ def get_config_file_name(E: int, device_name = current_platform.get_device_name().replace(" ", "_") dtype_selector = "" if not dtype else f",dtype={dtype}" block_shape_selector = ("" if not block_shape or not all(block_shape) else - f",block_shape={block_shape}") + f",block_shape={block_shape}").replace(" ", "") return f"E={E},N={N},device_name={device_name}{dtype_selector}{block_shape_selector}.json" # noqa: E501 diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename 
from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json 
b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git 
a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json 
similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 99fbda314f6..9895537c219 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -423,7 +423,7 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, # First look up if an optimized configuration is available in the configs # directory device_name = current_platform.get_device_name().replace(" ", "_") - json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n}, {block_k}].json" # noqa: E501 + json_file_name = f"N={N},K={K},device_name={device_name},dtype=fp8_w8a8,block_shape=[{block_n},{block_k}].json" # noqa: E501 
config_file_path = os.path.join( os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) From 5f79d8a178df3622e28b75b52d3fd4dfb1d58df7 Mon Sep 17 00:00:00 2001 From: Sumit Vij Date: Wed, 5 Feb 2025 19:54:13 -0800 Subject: [PATCH 0028/1240] [Model] LoRA Support for Ultravox model (#11253) Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 2 +- tests/conftest.py | 16 +++- tests/lora/test_ultravox.py | 121 +++++++++++++++++++++++++ vllm/model_executor/models/ultravox.py | 28 +++++- 4 files changed, 160 insertions(+), 7 deletions(-) create mode 100644 tests/lora/test_ultravox.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index ef7e77fa3ec..32f3e9deff6 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -857,7 +857,7 @@ See [this page](#generative-models) for more information on how to use generativ * Ultravox * T + AE+ * `fixie-ai/ultravox-v0_3` - * + * ✅︎ * ✅︎ * ✅︎ ::: diff --git a/tests/conftest.py b/tests/conftest.py index 85dd5bcb0dd..02105900f30 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -737,6 +737,7 @@ def generate( images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: inputs = self.get_inputs(prompts, images=images, @@ -744,7 +745,8 @@ def generate( audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) outputs: List[Tuple[List[List[int]], List[str]]] = [] for req_output in req_outputs: @@ -782,6 +784,7 @@ def generate_w_logprobs( images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: inputs = self.get_inputs(prompts, @@ -790,7 +793,8 @@ def generate_w_logprobs( audios=audios) req_outputs = self.model.generate(inputs, - sampling_params=sampling_params) + sampling_params=sampling_params, + **kwargs) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -826,13 +830,15 @@ def generate_greedy( images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + **kwargs: Any, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images, videos=videos, - audios=audios) + audios=audios, + **kwargs) return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] @@ -847,6 +853,7 @@ def generate_greedy_logprobs( videos: Optional[PromptVideoInput] = None, stop_token_ids: Optional[List[int]] = None, stop: Optional[List[str]] = None, + **kwargs: Any, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -861,7 +868,8 @@ def generate_greedy_logprobs( greedy_logprobs_params, images=images, audios=audios, - videos=videos) + videos=videos, + **kwargs) def generate_encoder_decoder_greedy_logprobs( self, diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py new file mode 100644 index 00000000000..1218dfa34be --- /dev/null +++ b/tests/lora/test_ultravox.py @@ -0,0 +1,121 @@ +import shutil +from os import path +from tempfile import TemporaryDirectory +from typing import 
List, Tuple + +import torch +from huggingface_hub import snapshot_download +from safetensors.torch import load_file, save_file +from transformers import AutoTokenizer + +from vllm.lora.request import LoRARequest + +from ..models.utils import check_outputs_equal + +ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3" +LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct" + +VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" + +PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!" + + +def llama3_1_8b_chess_lora_path(): + return snapshot_download( + repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b") + + +# can't use llama lora adapter without module name transformation +# because ultravox nest language model +def transform_module_names_for_ultravox(state_dict): + transformed_state_dict = {} + for key, value in state_dict.items(): + new_key = key.replace("base_model.model", + "base_model.model.language_model") + transformed_state_dict[new_key] = value + return transformed_state_dict + + +def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path): + tensor_file = "adapter_model.safetensors" + state_dict = load_file(path.join(source_repo, tensor_file)) + transformed_state_dict = transform_module_names_for_ultravox(state_dict) + + save_file(transformed_state_dict, path.join(target_path, tensor_file)) + + config_file = "adapter_config.json" + shutil.copyfile(path.join(source_repo, config_file), + path.join(target_path, config_file)) + return target_path + + +def _get_prompt(audio_count, question, placeholder, model_name) -> str: + tokenizer = AutoTokenizer.from_pretrained(model_name) + placeholder = f"{placeholder}\n" * audio_count + + return tokenizer.apply_chat_template([{ + 'role': 'user', + 'content': f"{placeholder}{question}" + }], + tokenize=False, + add_generation_prompt=True) + + +def test_ultravox_lora(vllm_runner): + """ + TODO: Train an Ultravox LoRA instead of using a Llama LoRA. + """ + # Workaround to prevent device mismatch in Whisper. 
+ # Can be removed when it is fixed upstream in transformer + # https://github.com/huggingface/transformers/pull/35866 + torch.set_default_device("cpu") + + llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path() + with TemporaryDirectory() as temp_ultravox_lora_dir: + llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora( + llama3_1_8b_chess_lora, temp_ultravox_lora_dir) + with vllm_runner( + ULTRAVOX_MODEL_NAME, + enforce_eager=True, + max_num_seqs=2, + enable_lora=True, + max_loras=1, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=1024, + ) as vllm_model: + ultravox_outputs: List[Tuple[ + List[int], str]] = vllm_model.generate_greedy( + [ + _get_prompt(0, PROMPT, VLLM_PLACEHOLDER, + ULTRAVOX_MODEL_NAME) + ], + 256, + lora_request=LoRARequest(str(1), 1, + llama3_1_8b_ultravox_chess_lora), + ) + + # run llama with and without lora to compare outputs with above + with vllm_runner( + LLMA_MODEL_NAME, + enforce_eager=True, + max_num_seqs=2, + enable_lora=True, + max_loras=1, + max_lora_rank=128, + dtype="bfloat16", + max_model_len=1024, + ) as vllm_model: + llama_outputs: List[Tuple[List[int], str]] = ( + vllm_model.generate_greedy( + [_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)], + 256, + lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora), + )) + + check_outputs_equal( + outputs_0_lst=ultravox_outputs, + outputs_1_lst=llama_outputs, + name_0="ultravox", + name_1="llama", + ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 52a4d798f4b..9da0682cfa8 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -22,6 +22,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, @@ -33,7 +34,7 @@ from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings, @@ -343,7 +344,20 @@ def forward( UltravoxMultiModalProcessor, info=UltravoxProcessingInfo, dummy_inputs=UltravoxDummyInputsBuilder) -class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): +class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): + + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + # LoRA specific attributes + # TODO : Add LoRA to the audio tower and projector. 
+ supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj" + ] + embedding_modules = {} + embedding_padding_modules = [] hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}) @@ -391,6 +405,16 @@ def sampler(self): return get_sampler() + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model.", + connector="multi_modal_projector.", + tower_model="audio_tower.", + ) + def _audio_features_to_embeddings( self, input_features: torch.Tensor) -> torch.Tensor: audio_input = input_features.to(self.audio_tower.dtype) From 5e4a3af268a65f8b8d5e2bd8a314db2cbe2df299 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Wed, 5 Feb 2025 21:25:54 -0800 Subject: [PATCH 0029/1240] [Bugfix] Fix the test_ultravox.py's license (#12806) Signed-off-by: Lu Fang Signed-off-by: Louis Ulmer --- tests/lora/test_ultravox.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/lora/test_ultravox.py b/tests/lora/test_ultravox.py index 1218dfa34be..703f92ce8b6 100644 --- a/tests/lora/test_ultravox.py +++ b/tests/lora/test_ultravox.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 + import shutil from os import path from tempfile import TemporaryDirectory From c0db76bb0632769dda0966c2b8f74d16a4bf7c86 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 6 Feb 2025 06:24:57 +0000 Subject: [PATCH 0030/1240] Improve `TransformersModel` UX (#12785) Signed-off-by: Louis Ulmer --- vllm/model_executor/models/transformers.py | 53 +++++++++++++--------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index dfc7143823d..43d2c88d3b9 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,7 +15,7 @@ # limitations under the License. """Wrapper around `transformers` models""" import re -from typing import Iterable, Optional, Union +from typing import Iterable, Literal, Optional, Union import torch from torch import nn @@ -72,15 +72,24 @@ def vllm_flash_attention_forward( ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward +def log_replacement(name: str, old_module: nn.Module, new_module: nn.Module): + logger.debug("%s: %s -> %s", name, old_module, new_module) + + def replace_linear_class( linear: nn.Linear, - style: str, + style: Literal["colwise", "rowwise"], quant_config=None) -> Union[ColumnParallelLinear, RowParallelLinear]: """ - In model configurations, we use a neutral type (string) to specify parallel - styles, here we use it to translate nn.Linear into vllm-style tp Linear. - - Quant config is not supported yet + Replace nn.Linear with one of vLLM's tensor parallel linear classes. + + `quant_config` is not yet supported. + Args: + linear (nn.Linear): `nn.Linear` to be replaced. + style (str): Tensor parallel style of the new linear, e.g. "colwise". + quant_config (QuantConfig): Quantization config for the new linear. + Returns: + Union[ColumnParallelLinear, RowParallelLinear]: The new linear. """ if not isinstance(style, str): @@ -93,7 +102,10 @@ def replace_linear_class( }.get(style) if vllm_linear_cls is None: - raise ValueError(f"Unsupported parallel style value: {style}") + logger.warning( + "Unsupported parallel style value: %s. 
" + "This layer will not be tensor parallelized.", style) + return linear class HFCompatibleLinear(vllm_linear_cls): """ @@ -119,25 +131,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() logger.info("Using Transformers backend.") - self.vllm_config = vllm_config config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - self.quant_config = quant_config + self.config = config + self.quant_config = quant_config self.vocab_size = config.vocab_size self.unpadded_vocab_size = config.vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, attn_implementation="vllm", - torch_dtype=vllm_config.model_config.dtype, trust_remote_code=vllm_config.model_config.trust_remote_code, ) prefix = self.model.base_model_prefix # MLP modifications - self.tensor_parallelize(self.model) + self.apply_base_model_tp_plan(self.model) # Attention modifications (assumes 1 attention op per hidden layer) tp_size = get_tensor_model_parallel_world_size() @@ -170,13 +181,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: config.vocab_size, logit_scale) self.sampler = get_sampler() - def log_replacement(self, name: str, old_module: nn.Module, - new_module: nn.Module): - logger.debug("%s: %s -> %s", name, old_module, new_module) - - def tensor_parallelize(self, module: nn.Module, prefix: str = ""): + def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): + """ + Apply the base model tensor parallelization plan to a module. + Currently only supports linear layers. + """ if (self.config.base_model_tp_plan is None - and self.vllm_config.parallel_config.tensor_parallel_size > 1): + and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " "support it yet!") @@ -189,9 +200,9 @@ def tensor_parallelize(self, module: nn.Module, prefix: str = ""): new_module = replace_linear_class(child_module, style, self.quant_config) setattr(module, child_name, new_module) - self.log_replacement(qual_name, child_module, new_module) + log_replacement(qual_name, child_module, new_module) else: - self.tensor_parallelize(child_module, prefix=qual_name) + self.apply_base_model_tp_plan(child_module, prefix=qual_name) def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings @@ -201,8 +212,8 @@ def replace_vocab_embed_class(self, module: nn.Module): org_num_embeddings=self.config.vocab_size, quant_config=None, ) - self.log_replacement("input embedding", - self.model.get_input_embeddings(), new_module) + log_replacement("input embedding", self.model.get_input_embeddings(), + new_module) self.model.set_input_embeddings(new_module) def forward( From 41f9ff30b4a800062ecf6c82af9969053761bf57 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 6 Feb 2025 02:16:20 -0500 Subject: [PATCH 0031/1240] [Misc] Remove duplicated DeepSeek V2/V3 model definition (#12793) Signed-off-by: Louis Ulmer --- vllm/config.py | 1 - vllm/model_executor/models/deepseek_v2.py | 48 +- vllm/model_executor/models/deepseek_v3.py | 806 ---------------------- vllm/model_executor/models/registry.py | 2 +- 4 files changed, 36 insertions(+), 821 deletions(-) delete mode 100644 vllm/model_executor/models/deepseek_v3.py diff --git a/vllm/config.py b/vllm/config.py index bc4bf627b8e..9ba49757612 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -754,7 +754,6 @@ def get_hidden_size(self) -> int: @property def 
is_deepseek_mla(self) -> bool: - # TODO add deepseek_v3 return (hasattr(self.hf_text_config, "model_type")) \ and (self.hf_text_config.model_type in \ ('deepseek_v2', 'deepseek_v3'))\ diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index fdd584f9d6d..773f5abe71d 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -21,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only DeepseekV2 model.""" +"""Inference-only DeepseekV2/DeepseekV3 model.""" from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch @@ -115,23 +115,32 @@ def __init__( raise ValueError(f"Unsupported activation: {config.hidden_act}. " "Only silu is supported for now.") - self.experts = FusedMoE(num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts") - self.gate = ReplicatedLinear(config.hidden_size, config.n_routed_experts, bias=False, quant_config=None, prefix=f"{prefix}.gate") + if config.topk_method == "noaux_tc": + self.gate.e_score_correction_bias = nn.Parameter( + torch.empty(config.n_routed_experts)) + else: + self.gate.e_score_correction_bias = None + + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func=config.scoring_func, + e_score_correction_bias=self.gate.e_score_correction_bias) + if config.n_shared_experts is not None: intermediate_size = (config.moe_intermediate_size * config.n_shared_experts) @@ -732,6 +741,15 @@ def load_weights(self, weights: Iterable[Tuple[str, for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + + # TODO(simon): support nextn predict layers + if hasattr(self.config, "num_nextn_predict_layers" + ) and self.config.num_nextn_predict_layers > 0: + assert self.config.num_nextn_predict_layers == 1 + layer_idx = self.config.num_hidden_layers + if name.startswith(f"model.layers.{layer_idx}"): + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: @@ -793,3 +811,7 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): + pass diff --git a/vllm/model_executor/models/deepseek_v3.py b/vllm/model_executor/models/deepseek_v3.py deleted file mode 100644 index 81f82b182f1..00000000000 --- a/vllm/model_executor/models/deepseek_v3.py +++ /dev/null @@ -1,806 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Adapted from -# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py -# Copyright 2023 The vLLM team. 
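For context, the `e_score_correction_bias` attached to the gate above implements DeepSeek-V3's auxiliary-loss-free balancing ("noaux_tc"): the bias only shifts which experts get selected, while the routing weights still come from the unbiased scores. A simplified sketch of that rule, ignoring the grouped top-k and the configured scoring function that the real FusedMoE path applies:

    import torch

    def noaux_tc_select(scores: torch.Tensor, bias: torch.Tensor, top_k: int):
        # scores: [num_tokens, n_routed_experts], bias: [n_routed_experts]
        topk_ids = (scores + bias).topk(top_k, dim=-1).indices  # biased selection
        topk_weights = scores.gather(-1, topk_ids)              # unbiased weights
        return topk_weights, topk_ids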
-# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only DeepseekV3 model.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm.attention import Attention, AttentionMetadata -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, ModelConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe import FusedMoE -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - ReplicatedLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsPP -from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) - - -class DeepseekV3MLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_up_proj") - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config, - reduce_results=reduce_results, - prefix=f"{prefix}.down_proj") - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class DeepseekV3MoE(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.tp_size = get_tensor_model_parallel_world_size() - self.routed_scaling_factor = config.routed_scaling_factor - self.n_shared_experts = config.n_shared_experts - self.routed_scaling_factor = config.routed_scaling_factor - if self.tp_size > config.n_routed_experts: - raise ValueError( - f"Tensor parallel size {self.tp_size} is greater than " - f"the number of experts {config.n_routed_experts}.") - - if config.hidden_act != "silu": - raise ValueError(f"Unsupported activation: {config.hidden_act}. " - "Only silu is supported for now.") - - self.gate = ReplicatedLinear(config.hidden_size, - config.n_routed_experts, - bias=False, - quant_config=None, - prefix=f"{prefix}.gate") - if config.topk_method == "noaux_tc": - self.gate.e_score_correction_bias = nn.Parameter( - torch.empty(config.n_routed_experts)) - else: - self.gate.e_score_correction_bias = None - - self.experts = FusedMoE( - num_experts=config.n_routed_experts, - top_k=config.num_experts_per_tok, - hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, - reduce_results=False, - renormalize=config.norm_topk_prob, - quant_config=quant_config, - use_grouped_topk=True, - num_expert_group=config.n_group, - topk_group=config.topk_group, - prefix=f"{prefix}.experts", - scoring_func=config.scoring_func, - e_score_correction_bias=self.gate.e_score_correction_bias) - - if config.n_shared_experts is not None: - intermediate_size = (config.moe_intermediate_size * - config.n_shared_experts) - self.shared_experts = DeepseekV3MLP( - hidden_size=config.hidden_size, - intermediate_size=intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - reduce_results=False, - ) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - if self.n_shared_experts is not None: - shared_output = self.shared_experts(hidden_states) - # router_logits: (num_tokens, n_experts) - router_logits, _ = self.gate(hidden_states) - final_hidden_states = self.experts( - hidden_states=hidden_states, - router_logits=router_logits) * self.routed_scaling_factor - if shared_output is not None: - final_hidden_states = final_hidden_states + shared_output - if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) - - return final_hidden_states.view(num_tokens, hidden_dim) - - -def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float: - import math - if scale <= 1: - return 1.0 - return 0.1 * mscale * math.log(scale) + 1.0 - - -class DeepseekV3Attention(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - hidden_size: int, - num_heads: int, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, - q_lora_rank: int, - kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - self.qk_nope_head_dim = 
qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.v_head_dim = v_head_dim - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - self.num_heads = num_heads - tp_size = get_tensor_model_parallel_world_size() - assert num_heads % tp_size == 0 - self.num_local_heads = num_heads // tp_size - self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear(self.hidden_size, - self.q_lora_rank, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_a_proj") - self.q_a_layernorm = RMSNorm(self.q_lora_rank, - eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear(q_lora_rank, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_b_proj") - else: - self.q_proj = ColumnParallelLinear(self.hidden_size, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_proj") - - self.kv_a_proj_with_mqa = ReplicatedLinear( - self.hidden_size, - self.kv_lora_rank + self.qk_rope_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_a_proj_with_mqa") - self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, - eps=config.rms_norm_eps) - self.kv_b_proj = ColumnParallelLinear( - self.kv_lora_rank, - self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_b_proj") - # O projection. - self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, - self.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj") - if rope_scaling: - rope_scaling["rope_type"] = 'deepseek_yarn' - self.use_normal_rope = False - else: - self.use_normal_rope = True - self.rotary_emb = get_rope(qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=False) - - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] - mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) - self.scaling = self.scaling * mscale * mscale - - self.attn = Attention(self.num_local_heads, - self.qk_head_dim, - self.scaling, - num_kv_heads=self.num_local_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - if self.q_lora_rank is not None: - q = self.q_a_proj(hidden_states)[0] - q = self.q_a_layernorm(q) - q = self.q_b_proj(q)[0].view(-1, self.num_local_heads, - self.qk_head_dim) - else: - q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads, - self.qk_head_dim) - q_nope, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim], - dim=-1) - latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0] - kv_a, _ = latent_cache.split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - latent_cache = latent_cache.unsqueeze(1) - kv_a = self.kv_a_layernorm(kv_a.contiguous()) - kv = self.kv_b_proj(kv_a)[0] - kv = kv.view(-1, self.num_local_heads, - self.qk_nope_head_dim + self.v_head_dim) - k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k_pe = latent_cache[:, :, self.kv_lora_rank:] - - if self.use_normal_rope: - seq_len 
= positions.size(0) - ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape - q_pe = q_pe.reshape(seq_len, -1) - k_pe = k_pe.reshape(seq_len, -1) - - q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) - - if self.use_normal_rope: - q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) - - q[..., self.qk_nope_head_dim:] = q_pe - k = torch.empty_like(q) - k[..., :self.qk_nope_head_dim] = k_nope - k[..., self.qk_nope_head_dim:] = k_pe - # padding value to qk_head_dim for alignment - v = torch.nn.functional.pad( - v, [0, self.qk_head_dim - self.v_head_dim], - value=0).view(-1, self.num_local_heads * self.qk_head_dim) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) - attn_output = attn_output.view( - -1, self.num_local_heads, - self.qk_head_dim)[..., :self.v_head_dim].reshape( - -1, self.num_local_heads * self.v_head_dim) - output, _ = self.o_proj(attn_output) - return output - - -class DeepseekV3MLAAttention(nn.Module): - """ - Main reference: DeepseekV2 paper, and FlashInfer Implementation - (https://arxiv.org/abs/2405.04434 and https://github.com/flashinfer-ai/flashinfer/pull/551). - - For more info see MLACommonImpl in: vllm/attention/backends/mla/utils.py - """ - - def __init__( - self, - config: PretrainedConfig, - hidden_size: int, - num_heads: int, - qk_nope_head_dim: int, - qk_rope_head_dim: int, - v_head_dim: int, - q_lora_rank: Optional[int], - kv_lora_rank: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim - self.v_head_dim = v_head_dim - - self.q_lora_rank = q_lora_rank - self.kv_lora_rank = kv_lora_rank - - self.num_heads = num_heads - tp_size = get_tensor_model_parallel_world_size() - assert num_heads % tp_size == 0 - self.num_local_heads = num_heads // tp_size - - self.scaling = self.qk_head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear(self.hidden_size, - self.q_lora_rank, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_a_proj") - self.q_a_layernorm = RMSNorm(self.q_lora_rank, - eps=config.rms_norm_eps) - self.q_b_proj = ColumnParallelLinear(q_lora_rank, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_b_proj") - else: - self.q_proj = ColumnParallelLinear(self.hidden_size, - self.num_heads * - self.qk_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.q_proj") - - self.kv_a_proj_with_mqa = ReplicatedLinear( - self.hidden_size, - self.kv_lora_rank + self.qk_rope_head_dim, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_a_proj_with_mqa") - self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, - eps=config.rms_norm_eps) - self.kv_b_proj = ColumnParallelLinear( - self.kv_lora_rank, - self.num_heads * (self.qk_nope_head_dim + self.v_head_dim), - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.kv_b_proj") - self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim, - self.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.o_proj") - - if rope_scaling: - rope_scaling["rope_type"] = 
'deepseek_yarn' - self.rotary_emb = get_rope(qk_rope_head_dim, - rotary_dim=qk_rope_head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - is_neox_style=False) - if rope_scaling: - mscale_all_dim = rope_scaling.get("mscale_all_dim", False) - scaling_factor = rope_scaling["factor"] - mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) - self.scaling = self.scaling * mscale * mscale - - self.mla_attn = Attention( - num_heads=self.num_local_heads, - head_size=self.kv_lora_rank, - scale=self.scaling, - num_kv_heads=1, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn", - use_mla=True, - # MLA Args - q_lora_rank=self.q_lora_rank, - kv_lora_rank=self.kv_lora_rank, - qk_nope_head_dim=self.qk_nope_head_dim, - qk_rope_head_dim=self.qk_rope_head_dim, - qk_head_dim=self.qk_head_dim, - v_head_dim=self.v_head_dim, - rotary_emb=self.rotary_emb, - q_proj=self.q_proj if self.q_lora_rank is None else self.q_b_proj, - kv_b_proj=self.kv_b_proj, - o_proj=self.o_proj, - ) - - self.prefix = prefix - self.debug_layer_idx = int(self.prefix.split(".")[-2]) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - if self.q_lora_rank is not None: - ckq = self.q_a_proj(hidden_states)[0] - hidden_states_or_q_c = self.q_a_layernorm(ckq) - else: - hidden_states_or_q_c = hidden_states - kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split( - [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1) - kv_c_normed = self.kv_a_layernorm(kv_c.contiguous()) - return self.mla_attn(hidden_states_or_q_c, kv_c_normed, k_pe, kv_cache, - attn_metadata) - - -class DeepseekV3DecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - prefix: str, - model_config: ModelConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - # DecoderLayers are created with `make_layers` which passes the prefix - # with the layer's index. 
- layer_idx = int(prefix.split(sep='.')[-1]) - if model_config.use_mla: - attn_cls = DeepseekV3MLAAttention - else: - attn_cls = DeepseekV3Attention - self.self_attn = attn_cls( - config=config, - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - qk_nope_head_dim=config.qk_nope_head_dim, - qk_rope_head_dim=config.qk_rope_head_dim, - v_head_dim=config.v_head_dim, - q_lora_rank=config.q_lora_rank - if hasattr(config, "q_lora_rank") else None, - kv_lora_rank=config.kv_lora_rank, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - if (config.n_routed_experts is not None - and layer_idx >= config.first_k_dense_replace - and layer_idx % config.moe_layer_freq == 0): - self.mlp = DeepseekV3MoE( - config=config, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - else: - self.mlp = DeepseekV3MLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - residual: Optional[torch.Tensor], - ) -> torch.Tensor: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -@support_torch_compile -class DeepseekV3Model(nn.Module): - - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - if get_pp_group().is_first_rank: - self.embed_tokens = VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - ) - else: - self.embed_tokens = PPMissingLayer() - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: DeepseekV3DecoderLayer( - config, - prefix, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - ), - prefix=f"{prefix}.layers") - - if get_pp_group().is_last_rank: - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: 
Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - assert intermediate_tensors is not None - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] - hidden_states, residual = layer(positions, hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, residual) - - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class DeepseekV3ForCausalLM(nn.Module, SupportsPP): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = DeepseekV3Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def sample( - self, - logits: Optional[torch.Tensor], - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.n_routed_experts) - - params_dict = dict(self.named_parameters()) - 
loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - # TODO(simon): support nextn predict layers - if hasattr(self.config, "num_nextn_predict_layers" - ) and self.config.num_nextn_predict_layers > 0: - assert self.config.num_nextn_predict_layers == 1 - layer_idx = self.config.num_hidden_layers - if name.startswith(f"model.layers.{layer_idx}"): - continue - - for (param_name, weight_name, shard_id) in stacked_params_mapping: - # Skip non-stacked layers and experts (experts handled below). - if weight_name not in name: - continue - # We have mlp.experts[0].gate_proj in the checkpoint. - # Since we handle the experts below in expert_params_mapping, - # we need to skip here BEFORE we update the name, otherwise - # name will be updated to mlp.experts[0].gate_up_proj, which - # will then be updated below in expert_params_mapping - # for mlp.experts[0].gate_gate_up_proj, which breaks load. - if (("mlp.experts." in name) and name not in params_dict): - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - name, - shard_id=shard_id, - expert_id=expert_id) - break - else: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b6708f77d8a..3b2a7069efc 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -45,7 +45,7 @@ "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), - "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), + "DeepseekV3ForCausalLM": ("deepseek_v2", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), From d2b687a722e60a40963b6df3f9f3779cac768cb8 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 6 Feb 2025 15:23:50 +0800 Subject: [PATCH 0032/1240] [Misc] Improve error message for incorrect pynvml (#12809) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- vllm/platforms/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 9c98942b556..e4767a378f4 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -41,7 +41,11 @@ def cuda_platform_plugin() -> Optional[str]: is_cuda = True finally: pynvml.nvmlShutdown() - except Exception: + except Exception as e: + if "nvml" not in e.__class__.__name__.lower(): + # If the error is not related to NVML, re-raise it. + raise e + # CUDA is supported on Jetson, but NVML may not be. 
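The intent of the widened `except` block above is to swallow only NVML-related failures (so Jetson-style systems can still fall back to a non-NVML check) while re-raising anything else, such as a broken pynvml install. A rough sketch of the same pattern outside vLLM, with an illustrative caller:

    def has_cuda_device() -> bool:
        try:
            import pynvml
            pynvml.nvmlInit()
            try:
                return pynvml.nvmlDeviceGetCount() > 0
            finally:
                pynvml.nvmlShutdown()
        except Exception as e:
            # Only NVML-flavoured errors mean "NVML unavailable"; anything
            # else is a real bug and should propagate.
            if "nvml" not in e.__class__.__name__.lower():
                raise
            return False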
import os From 56c370e80637e3565d0279e56f74b6720a2f2158 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Thu, 6 Feb 2025 04:02:14 -0500 Subject: [PATCH 0033/1240] [Misc] Update w2 scale loading for GPTQMarlinMoE (#12757) Signed-off-by: Louis Ulmer --- tests/weight_loading/models-large.txt | 2 ++ vllm/model_executor/layers/fused_moe/layer.py | 4 ++-- .../layers/quantization/gptq_marlin.py | 23 ++++++++++++++----- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/tests/weight_loading/models-large.txt b/tests/weight_loading/models-large.txt index 8ab7f05d7d1..9c1c11da572 100644 --- a/tests/weight_loading/models-large.txt +++ b/tests/weight_loading/models-large.txt @@ -1,5 +1,7 @@ compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main +compressed-tensors, nm-testing/test-w4a16-mixtral-actorder-group, main gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, main +gptq_marlin, TheBloke/Mixtral-8x7B-v0.1-GPTQ, gptq-8bit-128g-actorder_True awq_marlin, casperhansen/deepseek-coder-v2-instruct-awq, main \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3c7ef5e0080..f18c0313355 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -302,8 +302,8 @@ def __init__( "weight_loader": self.weight_loader, } # need full intermediate size pre-sharding for WNA16 act order - if (self.quant_method.__class__.__name__ == - "CompressedTensorsWNA16MoEMethod"): + if (self.quant_method.__class__.__name__ + in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod")): moe_quant_params["intermediate_size_full"] = intermediate_size self.quant_method.create_weights(layer=self, **moe_quant_params) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 99ab299958b..84c53b2c16d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -323,13 +323,18 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ): - # Currently assuming is_k_full is always True - # (input size per partition is the same as full input size) - # Supports only sym for now (no zp) + intermediate_size_full = extra_weight_attrs.pop( + "intermediate_size_full") + + self.is_k_full = (not self.quant_config.desc_act) or ( + intermediate_size_per_partition == intermediate_size_full) + if self.quant_config.group_size != -1: scales_size13 = hidden_size // self.quant_config.group_size - scales_size2 = (intermediate_size_per_partition // - self.quant_config.group_size) + w2_scales_size = (intermediate_size_full + if self.quant_config.desc_act else + intermediate_size_per_partition) + scales_size2 = (w2_scales_size // self.quant_config.group_size) strategy = FusedMoeWeightScaleSupported.GROUP.value else: scales_size13 = 1 @@ -385,6 +390,9 @@ def create_weights( ) layer.register_parameter("w2_scales", w2_scales) set_weight_attrs(w2_scales, extra_weight_attrs) + # dont shard the w2 scales when running act order + set_weight_attrs(w2_scales, + {"load_full_w2": self.quant_config.desc_act}) # up_proj scales w13_qzeros = torch.nn.Parameter( torch.empty(num_experts, @@ -406,6 +414,9 @@ def create_weights( ) layer.register_parameter("w2_qzeros", w2_qzeros) 
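To make the act-order sizing above concrete, a small worked example with assumed Mixtral-like numbers (purely illustrative, not values taken from the patch):

    # desc_act=True, group_size=128, tensor_parallel_size=2
    intermediate_size_full = 14336
    intermediate_size_per_partition = intermediate_size_full // 2   # 7168

    is_k_full = (not True) or (7168 == 14336)   # False -> forwarded to the kernel
    w2_scales_size = intermediate_size_full     # full size because desc_act is set
    scales_size2 = w2_scales_size // 128        # 112 groups, kept unsharded
                                                # (the load_full_w2 attribute)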
set_weight_attrs(w2_qzeros, extra_weight_attrs) + # dont shard the w2 scales when running act order + set_weight_attrs(w2_qzeros, + {"load_full_w2": self.quant_config.desc_act}) w13_g_idx = torch.nn.Parameter( torch.empty( num_experts, @@ -575,4 +586,4 @@ def apply( sort_indices1=layer.w13_g_idx_sort_indices, sort_indices2=layer.w2_g_idx_sort_indices, num_bits=self.quant_config.quant_type.size_bits, - ).to(orig_dtype) + is_k_full=self.is_k_full).to(orig_dtype) From be2d146094ef3e005bd4742d46c39ab1d856c12c Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 6 Feb 2025 01:02:38 -0800 Subject: [PATCH 0034/1240] [Docs] Add Google Cloud Slides (#12814) Signed-off-by: Louis Ulmer --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 09c2c6d35e6..cd0b1c517fd 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). -- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). +- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! 
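The weight-loading entries added to tests/weight_loading/models-large.txt in the GPTQMarlinMoE change above exercise this act-order path end to end. A hedged usage sketch; the model and revision come from that test list, while the remaining arguments are illustrative defaults rather than values prescribed by the patch:

    from vllm import LLM

    llm = LLM(
        model="TheBloke/Mixtral-8x7B-v0.1-GPTQ",
        revision="gptq-8bit-128g-actorder_True",
        quantization="gptq_marlin",
        tensor_parallel_size=2,
    )
    print(llm.generate("Hello, my name is")[0].outputs[0].text)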
From 72210ed75e62f4164666569f0f1f8964364c7bcb Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 6 Feb 2025 06:43:12 -0500 Subject: [PATCH 0035/1240] [Attention] Use FA3 for MLA on Hopper (#12807) Signed-off-by: Lucas Wilkinson Signed-off-by: Louis Ulmer --- vllm/attention/backends/flash_attn.py | 44 ++++++------------------ vllm/attention/backends/mla/utils.py | 2 ++ vllm/attention/backends/utils.py | 34 ++++++++++++++++++ vllm/v1/attention/backends/flash_attn.py | 30 +++------------- 4 files changed, 51 insertions(+), 59 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 6a82127acdf..971fe411695 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -14,19 +14,16 @@ AttentionMetadataBuilder, AttentionType) from vllm.attention.backends.utils import ( - PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping, - compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, - get_seq_len_block_table_args, is_all_cross_attn_metadata_set, - is_all_encoder_attn_metadata_set, is_block_tables_empty) -from vllm.envs import VLLM_FLASH_ATTN_VERSION + PAD_SLOT_ID, VLLM_FLASH_ATTN_VERSION, CommonAttentionState, + compute_slot_mapping, compute_slot_mapping_start_idx, + get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, + is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set, + is_block_tables_empty) from vllm.logger import init_logger from vllm.multimodal import MultiModalPlaceholderMap -from vllm.platforms import current_platform from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.vllm_flash_attn import (fa_version_unsupported_reason, - flash_attn_varlen_func, - flash_attn_with_kvcache, - is_fa_version_supported) +from vllm.vllm_flash_attn import (flash_attn_varlen_func, + flash_attn_with_kvcache) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, @@ -644,25 +641,6 @@ def __init__( f"Supported head sizes are: {support_head_sizes}.") self.attn_type = attn_type - # if hopper default to FA3, otherwise stick to FA2 for now - # TODO(lucas): profile FA3 on ampere to see if it makes sense to - # use FA3 as default for both - if current_platform.get_device_capability()[0] >= 9: - self.fa_version = 3 if is_fa_version_supported(3) else 2 - else: - self.fa_version = 2 - - if VLLM_FLASH_ATTN_VERSION is not None: - assert VLLM_FLASH_ATTN_VERSION in [2, 3] - self.fa_version = VLLM_FLASH_ATTN_VERSION - - if not is_fa_version_supported(self.fa_version): - logger.error("Cannot use FA version %d is not supported due to %s", - self.fa_version, - fa_version_unsupported_reason(self.fa_version)) - - assert is_fa_version_supported(self.fa_version) - def forward( self, layer: AttentionLayer, @@ -781,7 +759,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=prefill_output, - fa_version=self.fa_version, + fa_version=VLLM_FLASH_ATTN_VERSION, ) else: # prefix-enabled attention @@ -804,7 +782,7 @@ def forward( block_table=prefill_meta.block_tables, softcap=logits_soft_cap, out=prefill_output, - fa_version=self.fa_version, + fa_version=VLLM_FLASH_ATTN_VERSION, ) if decode_meta := attn_metadata.decode_metadata: @@ -833,7 +811,7 @@ def forward( softcap=logits_soft_cap, block_table=decode_meta.block_tables, out=decode_output, - fa_version=self.fa_version, + fa_version=VLLM_FLASH_ATTN_VERSION, ) else: # Use flash_attn_with_kvcache for normal decoding. 
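Since the FA2/FA3 choice is now computed once in vllm.attention.backends.utils, it can still be pinned per process through the environment variable checked there (values other than 2 or 3 are rejected by the assert). A small usage sketch; the serve command line is illustrative:

    # Force FlashAttention-2 even on Hopper, e.g. while triaging an FA3 issue:
    #   VLLM_FLASH_ATTN_VERSION=2 vllm serve meta-llama/Llama-3.1-8B-Instruct
    import os
    os.environ["VLLM_FLASH_ATTN_VERSION"] = "2"  # set before importing vllm

    from vllm import LLM
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")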
@@ -854,7 +832,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=decode_output.unsqueeze(1), - fa_version=self.fa_version, + fa_version=VLLM_FLASH_ATTN_VERSION, ) return output diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index cd8c08e5ab4..e1285d1fad3 100644 --- a/vllm/attention/backends/mla/utils.py +++ b/vllm/attention/backends/mla/utils.py @@ -12,6 +12,7 @@ from vllm.attention.backends.abstract import (AttentionLayer, AttentionMetadata, MLAAttentionImpl, T) +from vllm.attention.backends.utils import VLLM_FLASH_ATTN_VERSION from vllm.distributed import (get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -533,6 +534,7 @@ def _forward_prefill_flash( max_seqlen_k=max_prefill_seq_len, softmax_scale=self.scale, causal=True, + fa_version=VLLM_FLASH_ATTN_VERSION, ) attn_output = attn_output\ .view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]]\ diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index ad53e4e70b0..3c5028a66d5 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -8,12 +8,17 @@ import numpy as np import torch +from vllm import envs from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder, AttentionState) from vllm.attention.backends.abstract import AttentionType +from vllm.logger import logging from vllm.multimodal import MultiModalPlaceholderMap +from vllm.platforms import current_platform from vllm.utils import async_tensor_h2d, make_tensor_with_pad +logger = logging.getLogger(__name__) + if TYPE_CHECKING: from vllm.worker.model_runner_base import ModelRunnerBase @@ -580,3 +585,32 @@ def get_num_prefill_decode_query_kv_tokens( return (num_prefill_query_tokens, num_prefill_kv_tokens, num_decode_query_tokens) + + +try: + from vllm.vllm_flash_attn.flash_attn_interface import ( + fa_version_unsupported_reason, is_fa_version_supported) + + def flash_attn_version(): + # if hopper default to FA3, otherwise stick to FA2 for now + # TODO(lucas): profile FA3 on ampere to see if it makes sense to + # use FA3 as default for both + if current_platform.get_device_capability()[0] >= 9: + fa_version = 3 if is_fa_version_supported(3) else 2 + else: + fa_version = 2 + + if envs.VLLM_FLASH_ATTN_VERSION is not None: + assert envs.VLLM_FLASH_ATTN_VERSION in [2, 3] + fa_version = envs.VLLM_FLASH_ATTN_VERSION + + if not is_fa_version_supported(fa_version): + logger.error("Cannot use FA version %d is not supported due to %s", + fa_version, fa_version_unsupported_reason(fa_version)) + + assert is_fa_version_supported(fa_version) + return fa_version + + VLLM_FLASH_ATTN_VERSION = flash_attn_version() +except ImportError: + VLLM_FLASH_ATTN_VERSION = None diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 837d7faf437..204afc9f402 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -10,13 +10,10 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) -from vllm.envs import VLLM_FLASH_ATTN_VERSION +from vllm.attention.backends.utils import VLLM_FLASH_ATTN_VERSION from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import cdiv -from vllm.vllm_flash_attn import (fa_version_unsupported_reason, - flash_attn_varlen_func, - is_fa_version_supported) +from 
vllm.vllm_flash_attn import flash_attn_varlen_func logger = init_logger(__name__) @@ -136,25 +133,6 @@ def __init__( "are not implemented for " "FlashAttentionImpl") - # if hopper default to FA3, otherwise stick to FA2 for now - # TODO(lucas): profile FA3 on ampere to see if it makes sense to - # use FA3 as default for both - if current_platform.get_device_capability()[0] >= 9: - self.fa_version = 3 if is_fa_version_supported(3) else 2 - else: - self.fa_version = 2 - - if VLLM_FLASH_ATTN_VERSION is not None: - assert VLLM_FLASH_ATTN_VERSION in [2, 3] - self.fa_version = VLLM_FLASH_ATTN_VERSION - - if not is_fa_version_supported(self.fa_version): - logger.error("Cannot use FA version %d is not supported due to %s", - self.fa_version, - fa_version_unsupported_reason(self.fa_version)) - - assert is_fa_version_supported(self.fa_version) - def forward( self, layer: torch.nn.Module, @@ -227,7 +205,7 @@ def forward( window_size=self.sliding_window, block_table=attn_metadata.block_table, softcap=self.logits_soft_cap, - fa_version=self.fa_version, + fa_version=VLLM_FLASH_ATTN_VERSION, ) return output @@ -249,7 +227,7 @@ def forward( logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, - fa_version=self.fa_version, + fa_version=VLLM_FLASH_ATTN_VERSION, ) return output From 0b2083683c78a3d5805e47cfad399da23a703bee Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 6 Feb 2025 06:59:18 -0800 Subject: [PATCH 0036/1240] [misc] Reduce number of config file requests to HuggingFace (#12797) Signed-off-by: EC2 Default User Signed-off-by: <> Co-authored-by: EC2 Default User Signed-off-by: Louis Ulmer --- vllm/transformers_utils/config.py | 36 ++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 1c0f20a6e04..85056158bab 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional, Type, Union import huggingface_hub -from huggingface_hub import (file_exists, hf_hub_download, +from huggingface_hub import (file_exists, hf_hub_download, list_repo_files, try_to_load_from_cache) from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, LocalEntryNotFoundError, @@ -395,18 +395,28 @@ def get_sentence_transformer_tokenizer_config(model: str, - dict: A dictionary containing the configuration parameters for the Sentence Transformer BERT model. 
""" - for config_name in [ - "sentence_bert_config.json", - "sentence_roberta_config.json", - "sentence_distilbert_config.json", - "sentence_camembert_config.json", - "sentence_albert_config.json", - "sentence_xlm-roberta_config.json", - "sentence_xlnet_config.json", - ]: - encoder_dict = get_hf_file_to_dict(config_name, model, revision) - if encoder_dict: - break + sentence_transformer_config_files = [ + "sentence_bert_config.json", + "sentence_roberta_config.json", + "sentence_distilbert_config.json", + "sentence_camembert_config.json", + "sentence_albert_config.json", + "sentence_xlm-roberta_config.json", + "sentence_xlnet_config.json", + ] + try: + # If model is on HuggingfaceHub, get the repo files + repo_files = list_repo_files(model, revision=revision, token=HF_TOKEN) + except Exception as e: + logger.debug("Error getting repo files", e) + repo_files = [] + + encoder_dict = None + for config_name in sentence_transformer_config_files: + if config_name in repo_files or Path(model).exists(): + encoder_dict = get_hf_file_to_dict(config_name, model, revision) + if encoder_dict: + break if not encoder_dict: return None From 0781829a4a1d28ea03621e6a323dfb0760d135bd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 7 Feb 2025 00:45:44 +0800 Subject: [PATCH 0037/1240] [Misc] Remove unnecessary decode call (#12833) Signed-off-by: Louis Ulmer --- vllm/inputs/preprocess.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 4d8f28cb041..53f89996f0f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -260,8 +260,6 @@ def _process_multimodal( mm_processor = self.mm_registry.create_processor( self.model_config, tokenizer) - if isinstance(prompt, list): - prompt = tokenizer.decode(prompt) if mm_processor_kwargs is None: mm_processor_kwargs = {} From a8ac7f7e66eb89f359c4ba3c92bca79d7fcdc7e5 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 7 Feb 2025 00:46:13 +0800 Subject: [PATCH 0038/1240] [Kernel] Make rotary_embedding ops more flexible with input shape (#12777) Signed-off-by: Louis Ulmer --- csrc/pos_encoding_kernels.cu | 103 +++++++++++++++++++--- tests/kernels/test_pos_encoding.py | 31 +++++-- vllm/attention/backends/mla/utils.py | 25 +----- vllm/model_executor/models/deepseek_v2.py | 13 +-- 4 files changed, 115 insertions(+), 57 deletions(-) diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 97184a87355..c085d31a3e9 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -124,18 +124,54 @@ __global__ void batched_rotary_embedding_kernel( void rotary_embedding( torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or - // [num_tokens, num_heads * head_size] + // [num_tokens, num_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or - // [num_tokens, num_kv_heads * head_size] + // [num_tokens, num_kv_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] int64_t head_size, torch::Tensor& cos_sin_cache, // [max_position, rot_dim] bool is_neox) { - int64_t num_tokens = query.numel() / query.size(-1); + // num_tokens = batch_size * seq_len + int64_t num_tokens = positions.numel(); + int positions_ndim = positions.dim(); + + // Make sure num_tokens dim is consistent across positions, query, 
and key. + TORCH_CHECK( + positions_ndim == 1 || positions_ndim == 2, + "positions must have shape [num_tokens] or [batch_size, seq_len]"); + if (positions_ndim == 1) { + TORCH_CHECK( + query.size(0) == positions.size(0) && key.size(0) == positions.size(0), + "query, key and positions must have the same number of tokens"); + } + if (positions_ndim == 2) { + TORCH_CHECK( + query.size(0) == positions.size(0) && + key.size(0) == positions.size(0) && + query.size(1) == positions.size(1) && + key.size(1) == positions.size(1), + "query, key and positions must have the same batch_size and seq_len"); + } + + // Make sure head_size is valid for query and key + // hidden_size = num_heads * head_size + int query_hidden_size = query.numel() / num_tokens; + int key_hidden_size = key.numel() / num_tokens; + TORCH_CHECK(query_hidden_size % head_size == 0); + TORCH_CHECK(key_hidden_size % head_size == 0); + + // Make sure query and key have consistent number of heads + int num_heads = query_hidden_size / head_size; + int num_kv_heads = key_hidden_size / head_size; + TORCH_CHECK(num_heads % num_kv_heads == 0); + int rot_dim = cos_sin_cache.size(1); - int num_heads = query.size(-1) / head_size; - int num_kv_heads = key.size(-1) / head_size; - int64_t query_stride = query.stride(-2); - int64_t key_stride = key.stride(-2); + int seq_dim_idx = positions_ndim - 1; + int64_t query_stride = query.stride(seq_dim_idx); + int64_t key_stride = key.stride(seq_dim_idx); dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); @@ -165,19 +201,58 @@ and process in batched manner. void batched_rotary_embedding( torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or - // [num_tokens, num_heads * head_size] + // [num_tokens, num_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or - // [num_tokens, num_kv_heads * head_size] + // [num_tokens, num_kv_heads * head_size] or + // [batch_size, seq_len, num_heads, head_size] or + // [num_tokens, num_heads, head_size] int64_t head_size, torch::Tensor& cos_sin_cache, // [max_position, rot_dim] bool is_neox, int64_t rot_dim, - torch::Tensor& cos_sin_cache_offsets // [num_tokens] + torch::Tensor& cos_sin_cache_offsets // [num_tokens] or [batch_size] ) { + // num_tokens = batch_size * seq_len int64_t num_tokens = cos_sin_cache_offsets.size(0); - int num_heads = query.size(-1) / head_size; - int num_kv_heads = key.size(-1) / head_size; - int64_t query_stride = query.stride(-2); - int64_t key_stride = key.stride(-2); + TORCH_CHECK( + positions.size(0) == num_tokens || positions.numel() == num_tokens, + "positions must have the same num_tokens or batch_size as " + "cos_sin_cache_offsets"); + + int positions_ndim = positions.dim(); + // Make sure num_tokens dim is consistent across positions, query, and key. 
+ TORCH_CHECK( + positions_ndim == 1 || positions_ndim == 2, + "positions must have shape [num_tokens] or [batch_size, seq_len]"); + if (positions_ndim == 1) { + TORCH_CHECK( + query.size(0) == positions.size(0) && key.size(0) == positions.size(0), + "query, key and positions must have the same number of tokens"); + } + if (positions_ndim == 2) { + TORCH_CHECK( + query.size(0) == positions.size(0) && + key.size(0) == positions.size(0) && + query.size(1) == positions.size(1) && + key.size(1) == positions.size(1), + "query, key and positions must have the same batch_size and seq_len"); + } + + // Make sure head_size is valid for query and key + int query_hidden_size = query.numel() / num_tokens; + int key_hidden_size = key.numel() / num_tokens; + TORCH_CHECK(query_hidden_size % head_size == 0); + TORCH_CHECK(key_hidden_size % head_size == 0); + + // Make sure query and key have concistent number of heads + int num_heads = query_hidden_size / head_size; + int num_kv_heads = key_hidden_size / head_size; + TORCH_CHECK(num_heads % num_kv_heads == 0); + + int seq_dim_idx = positions_ndim - 1; + int64_t query_stride = query.stride(seq_dim_idx); + int64_t key_stride = key.stride(seq_dim_idx); dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 5b7b0fda2be..af9bfd2f0f5 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from itertools import accumulate, product -from typing import Dict, List, Optional +from typing import Callable, Dict, List, Optional import pytest import torch @@ -24,7 +24,21 @@ ] +def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int, + head_size: int) -> tuple[int, ...]: + return (batch_size, seq_len, num_heads * head_size) + + +def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int, + head_size: int) -> tuple[int, ...]: + return (batch_size, seq_len, num_heads, head_size) + + +TENSORS_SHAPES_FN = [_get_batch_tensor_shape, _get_flat_tensor_shape] + + @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @@ -36,6 +50,7 @@ @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, + tensor_shape_fn: Callable[[int, int, int, int], tuple[int]], batch_size: int, seq_len: int, num_heads: int, @@ -58,10 +73,8 @@ def test_rotary_embedding( rope = rope.to(dtype=dtype) positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) + query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size) + query = torch.randn(query_shape, dtype=dtype) key = torch.randn_like(query) # NOTE(woosuk): The reference implementation should be executed first @@ -80,6 +93,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @@ -91,6 +105,7 @@ def test_rotary_embedding( @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, + tensor_shape_fn: Callable[[int, int, int, int], tuple[int]], 
batch_size: int, seq_len: int, num_heads: int, @@ -113,10 +128,8 @@ def test_batched_rotary_embedding( rope = rope.to(dtype=dtype) positions = torch.randint(0, max_position, (batch_size, seq_len)) - query = torch.randn(batch_size, - seq_len, - num_heads * head_size, - dtype=dtype) + query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size) + query = torch.randn(query_shape, dtype=dtype) key = torch.randn_like(query) # NOTE(woosuk): The reference implementation should be executed first diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index e1285d1fad3..c22f7e92103 100644 --- a/vllm/attention/backends/mla/utils.py +++ b/vllm/attention/backends/mla/utils.py @@ -424,24 +424,6 @@ def _forward_decode( ) -> torch.Tensor: raise NotImplementedError - def apply_pure_rope( - self, - input_positions: torch.Tensor, - q_pe: torch.Tensor, - k_pe: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - seq_len = input_positions.size(0) - ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape - - q_pe, k_pe = self.rotary_emb( - input_positions, - q_pe.reshape(seq_len, -1), - k_pe.reshape(seq_len, -1), - ) - q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) - - return q_pe, k_pe - def forward( self, layer: AttentionLayer, @@ -466,14 +448,13 @@ def forward( # Restore head dim (for rotary embedding) k_pe = k_pe.unsqueeze(1) assert hasattr(attn_metadata, "input_positions") - rope_fn = (self.rotary_emb - if self.use_yarn_rope else self.apply_pure_rope) if is_decode: q_nope = self._q_proj_and_k_up_proj(hidden_states_or_q_c) q_pe = torch.matmul(hidden_states_or_q_c, self.W_QR)\ .view(-1, self.num_heads, self.qk_rope_head_dim) - q_pe, k_pe = rope_fn(attn_metadata.input_positions, q_pe, k_pe) + q_pe, k_pe = self.rotary_emb(attn_metadata.input_positions, q_pe, + k_pe) else: assert is_prefill q = self.q_proj(hidden_states_or_q_c)[0]\ @@ -481,7 +462,7 @@ def forward( # TODO(lucas): there must be a nicer way to write this line q[..., self.qk_nope_head_dim:], k_pe = \ - rope_fn( + self.rotary_emb( attn_metadata.input_positions, q[..., self.qk_nope_head_dim:], k_pe) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 773f5abe71d..0c6f07ce7b1 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -257,9 +257,7 @@ def __init__( prefix=f"{prefix}.o_proj") if rope_scaling: rope_scaling["rope_type"] = 'deepseek_yarn' - self.use_normal_rope = False - else: - self.use_normal_rope = True + self.rotary_emb = get_rope(qk_rope_head_dim, rotary_dim=qk_rope_head_dim, max_position=max_position_embeddings, @@ -309,17 +307,8 @@ def forward( k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) k_pe = latent_cache[:, :, self.kv_lora_rank:] - if self.use_normal_rope: - seq_len = positions.size(0) - ori_q_pe_shape, ori_k_pe_shape = q_pe.shape, k_pe.shape - q_pe = q_pe.reshape(seq_len, -1) - k_pe = k_pe.reshape(seq_len, -1) - q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe) - if self.use_normal_rope: - q_pe, k_pe = q_pe.view(ori_q_pe_shape), k_pe.view(ori_k_pe_shape) - q[..., self.qk_nope_head_dim:] = q_pe k = torch.empty_like(q) k[..., :self.qk_nope_head_dim] = k_nope From b6b7759bea9e9e31f99496047ce7bcd4ede334d2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 7 Feb 2025 01:09:07 +0800 Subject: [PATCH 0039/1240] [torch.compile] PyTorch 2.6 and nightly compatibility (#12393) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- 
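The compiler refactor in this patch funnels all graph compilation through a small interface contract: compile() returns a (callable, handle) pair and load() rebuilds the callable from a previously stored handle, with the manager keying its cache by (runtime_shape, graph_index, backend_name). A minimal sketch of that contract with made-up names (ToyCompiler and run_or_reuse are illustrative, not vLLM APIs):

```python
# Illustrative sketch of the compile()/load() contract introduced in this
# patch. ToyCompiler and run_or_reuse are made-up names, not vLLM classes.
from typing import Any, Callable, Dict, Optional, Tuple


class ToyCompiler:
    name = "toy"

    def compile(self, fn: Callable) -> Tuple[Callable, Optional[Any]]:
        # A real backend would lower `fn`; the handle lets a later run skip
        # compilation entirely. Returning None means "not cacheable".
        handle = f"{self.name}:{fn.__name__}"
        return fn, handle

    def load(self, handle: Any, fn: Callable) -> Callable:
        # Rebuild the runnable object from the stored handle.
        assert isinstance(handle, str) and handle.startswith(self.name)
        return fn


def run_or_reuse(cache: Dict[Any, Any], key: Any, compiler: ToyCompiler,
                 fn: Callable) -> Callable:
    # Prefer loading from the cache; otherwise compile and remember the handle.
    if key in cache:
        return compiler.load(cache[key], fn)
    compiled, handle = compiler.compile(fn)
    if handle is not None:
        cache[key] = handle
    return compiled
```

In the real CompilerManager below, the cache key also carries the backend name, so inductor- and eager-compiled artifacts never collide.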
tests/compile/piecewise/test_simple.py | 2 +- tests/compile/piecewise/test_toy_llama.py | 6 +- vllm/compilation/backends.py | 437 +++++++--------------- vllm/compilation/compiler_interface.py | 340 +++++++++++++++++ vllm/compilation/counter.py | 2 +- vllm/compilation/inductor_pass.py | 1 - vllm/compilation/pass_manager.py | 16 +- vllm/config.py | 9 - 8 files changed, 493 insertions(+), 320 deletions(-) create mode 100644 vllm/compilation/compiler_interface.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 9d633ad259b..143cb49697f 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -92,7 +92,7 @@ def test_simple_piecewise_compile(): num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=5, # 2 * num_layers + 1 num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_inductor_compilations=3, # num_piecewise_capturable_graphs_seen + num_backend_compilations=3, # num_piecewise_capturable_graphs_seen num_cudagraph_caputured= 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 0404722bab8..021bd4cc463 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -322,7 +322,7 @@ def test_toy_llama(): num_graphs_seen=0, num_piecewise_graphs_seen=0, num_piecewise_capturable_graphs_seen=0, - num_inductor_compilations=0, + num_backend_compilations=0, num_cudagraph_caputured=0, ): outputs.append(run_model(llama_config, use_compile=False)) @@ -332,7 +332,7 @@ def test_toy_llama(): num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, num_piecewise_capturable_graphs_seen=1, - num_inductor_compilations=1, # num_piecewise_capturable_graphs_seen + num_backend_compilations=1, # num_piecewise_capturable_graphs_seen num_cudagraph_caputured= 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): @@ -345,7 +345,7 @@ def test_toy_llama(): 1, # 2 * num_layers + 1 num_piecewise_capturable_graphs_seen=1 + llama_config.num_layers, # 1 + num_layers - num_inductor_compilations=1 + + num_backend_compilations=1 + llama_config.num_layers, # num_piecewise_capturable_graphs_seen num_cudagraph_caputured=2 * (1 + llama_config.num_layers diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 979890170c1..b972f03c968 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,12 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 import ast -import copy import dataclasses import os import pprint import time -from collections import defaultdict from contextlib import ExitStack from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple from unittest.mock import patch @@ -19,6 +17,7 @@ from vllm.logger import init_logger from vllm.utils import weak_ref_tensors +from .compiler_interface import EagerAdaptor, InductorAdaptor from .counter import compilation_counter from .inductor_pass import InductorPass from .monitor import end_monitoring_torch_compile @@ -27,306 +26,128 @@ logger = init_logger(__name__) -@dataclasses.dataclass -class InductorArtifact: - hash_str: str = "" - file_path: str = "" +class CompilerManager: + """ + A manager to manage the compilation process, including + caching the compiled graph, loading the compiled graph, + and compiling the graph. 
+ The cache is a dict mapping + `(runtime_shape, graph_index, backend_name)` + to `any_data` returned from the compiler. -class InductorHashCache: + When serializing the cache, we save it to a Python file + for readability. We don't use json here because json doesn't + support int as key. """ - Disk format: a Python list of tuples, each tuple is - (runtime_shape, graph_index, hash_str, file_path) - We use list of tuple for readability. - In-memory format: a defaultdict of dict, where the key is - runtime_shape, and the value is a dict of graph_index to hash_str. + def __init__(self, use_inductor: bool): + self.cache: Dict[Tuple[Optional[int], int, str], Any] = dict() + cls = InductorAdaptor if use_inductor else EagerAdaptor + self.compiler = cls() - The data is essentially `Dict[Optional[int], Dict[int, InductorArtifact]]`, - we don't use json here because json doesn't support int as key. - - TODO: better off-the-shelf solution to serialize the data? - """ + def compute_hash(self, vllm_config: VllmConfig) -> str: + return self.compiler.compute_hash(vllm_config) - def __init__(self, cache_dir: str, disabled: bool = False): - self.cache: Dict[Optional[int], - Dict[int, InductorArtifact]] = defaultdict(dict) - self.disabled = disabled + def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + self.disable_cache = disable_cache self.cache_dir = cache_dir - self.cache_file_path = os.path.join(cache_dir, - "inductor_hash_cache.py") - if disabled: - return - # set flags so that Inductor and Triton store their cache - # in the cache_dir, then users only need to copy the cache_dir - # to another machine to reuse the cache. - inductor_cache = os.path.join(cache_dir, "inductor_cache") - os.makedirs(inductor_cache, exist_ok=True) - os.environ["TORCHINDUCTOR_CACHE_DIR"] = inductor_cache - triton_cache = os.path.join(cache_dir, "triton_cache") - os.makedirs(triton_cache, exist_ok=True) - os.environ["TRITON_CACHE_DIR"] = triton_cache - if os.path.exists(self.cache_file_path): + self.cache_file_path = os.path.join(cache_dir, "vllm_compile_cache.py") + + if not disable_cache and os.path.exists(self.cache_file_path): + # load the cache from the file with open(self.cache_file_path) as f: - self.deserialize(f.read()) - - def deserialize(self, data: str): - # we use ast.literal_eval to parse the data - # because it is a safe way to parse Python literals. - # do not use eval(), it is unsafe. - list_data = ast.literal_eval(data) - for item in list_data: - runtime_shape = item[0] - graph_index = item[1] - hash_str = item[2] - # for compatibility of old version, - # where we don't have file_path. - # NOTE: after running the new code, the file_path - # will be updated. - file_path = "" if len(item) == 3 else item[3] - self.cache[runtime_shape][graph_index] = InductorArtifact( - hash_str=hash_str, file_path=file_path) - - def serialize(self) -> str: - data = [] - for runtime_shape, value in self.cache.items(): - for graph_index, inductor_artifact in value.items(): - data.append( - (runtime_shape, graph_index, inductor_artifact.hash_str, - inductor_artifact.file_path)) - printer = pprint.PrettyPrinter(indent=4) - return printer.pformat(data) + # we use ast.literal_eval to parse the data + # because it is a safe way to parse Python literals. + # do not use eval(), it is unsafe. 
+ self.cache = ast.literal_eval(f.read()) + + self.compiler.initialize_cache(cache_dir=cache_dir, + disable_cache=disable_cache) def save_to_file(self): - if self.disabled: + if self.disable_cache: return with open(self.cache_file_path, "w") as f: - f.write(self.serialize()) - - def __contains__(self, key: Tuple[Optional[int], int]) -> bool: - if self.disabled: - return False - runtime_shape, graph_index = key - return runtime_shape in self.cache and graph_index in self.cache[ - runtime_shape] - - def __getitem__(self, key: Tuple[Optional[int], int]) -> InductorArtifact: - if self.disabled: - raise KeyError("cannot read from disabled cache") - runtime_shape, graph_index = key - return self.cache[runtime_shape][graph_index] - - def __setitem__(self, key: Tuple[Optional[int], int], - value: InductorArtifact): - # setitem for disabled cache is fine, because we - # don't actually write to the disk - runtime_shape, graph_index = key - self.cache[runtime_shape][graph_index] = value - - -class AlwaysHitShapeEnv: - """ - Why do we need this class: - - For normal `torch.compile` usage, every compilation will have - one Dynamo bytecode compilation and one Inductor compilation. - The Inductor compilation happens under the context of the - Dynamo bytecode compilation, and that context is used to - determine the dynamic shape information, etc. - - For our use case, we only run Dynamo bytecode compilation once, - and run Inductor compilation multiple times with different shapes - plus a general shape. The compilation for specific shapes happens - outside of the context of the Dynamo bytecode compilation. At that - time, we don't have shape environment to provide to Inductor, and - it will fail the Inductor code cache lookup. - - By providing a dummy shape environment that always hits, we can - make the Inductor code cache lookup always hit, and we can - compile the graph for different shapes as needed. - - The following dummy methods are obtained by trial-and-error - until it works. 
- """ - - def __init__(self) -> None: - self.guards: List[Any] = [] - - def evaluate_guards_expression(self, *args, **kwargs): - return True - - def get_pruned_guards(self, *args, **kwargs): - return [] - - def produce_guards_expression(self, *args, **kwargs): - return "" - - -def wrap_inductor(graph: fx.GraphModule, - example_inputs, - additional_inductor_config, - compilation_config: CompilationConfig, - vllm_backend: "VllmBackend", - graph_index: int = 0, - num_graphs: int = 1, - runtime_shape: Optional[int] = None, - use_inductor: bool = True) -> Any: - if graph_index == 0: - # before compiling the first graph, record the start time - global compilation_start_time - compilation_start_time = time.time() - - if not use_inductor: - return graph - - compilation_counter.num_inductor_compilations += 1 - - from torch._inductor import config - current_config = config.get_config_copy() - from torch._inductor.compile_fx import compile_fx - - if additional_inductor_config is not None: - current_config.update(additional_inductor_config) - - if isinstance(runtime_shape, int): - # for a specific batchsize, tuning triton kernel parameters - # can be beneficial - current_config["max_autotune"] = True - current_config["coordinate_descent_tuning"] = True - - # inductor can inplace modify the graph, so we need to copy it - # see https://github.com/pytorch/pytorch/issues/138980 - graph = copy.deepcopy(graph) - - cache_data = vllm_backend.inductor_hash_cache - if (runtime_shape, graph_index) in cache_data: - # we compiled this graph before - # so we can directly lookup the compiled graph via hash - inductor_artifact = cache_data[(runtime_shape, graph_index)] - hash_str = inductor_artifact.hash_str - if graph_index == 0: - # adds some info logging for the first graph - logger.info( - "Directly lookup the graph for shape %s from the cache", - str(runtime_shape)) # noqa + printer = pprint.PrettyPrinter(indent=4) + data = printer.pformat(self.cache) + f.write(data) + + def load(self, + graph: fx.GraphModule, + example_inputs: List[Any], + graph_index: int, + runtime_shape: Optional[int] = None) -> Optional[Callable]: + if (runtime_shape, graph_index, self.compiler.name) not in self.cache: + return None + handle = self.cache[(runtime_shape, graph_index, self.compiler.name)] + compiled_graph = self.compiler.load(handle, graph, example_inputs, + graph_index, runtime_shape) logger.debug( - "directly lookup the %s-th graph for shape %s via hash %s", - graph_index, str(runtime_shape), hash_str) - from torch._inductor.codecache import FxGraphCache - with patch("torch._inductor.codecache.FxGraphCache._get_shape_env", - lambda *args, **kwargs: AlwaysHitShapeEnv()): - inductor_compiled_graph = FxGraphCache._lookup_graph( - hash_str, example_inputs, True, False) - assert inductor_compiled_graph is not None, ( - "Inductor cache lookup failed. Please remove" - f"the cache file {cache_data.cache_file_path} and try again." 
# noqa - ) - inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa - - # Inductor calling convention (function signature): - # f(list) -> tuple - # Dynamo calling convention (function signature): - # f(*args) -> Any - - # need to know if the graph returns a tuple - from torch._inductor.compile_fx import graph_returns_tuple - returns_tuple = graph_returns_tuple(graph) - - # this is the callable we return to Dynamo to run - def compiled_graph(*args): - # convert args to list - list_args = list(args) - graph_output = inductor_compiled_graph(list_args) - # unpack the tuple if needed - if returns_tuple: - return graph_output - else: - return graph_output[0] - else: - # it's the first time we compile this graph - # the assumption is that we don't have nested Inductor compilation. - # compiled_fx_graph_hash will only be called once, and we can hook - # it to get the hash of the compiled graph directly. - - inductor_artifact = InductorArtifact() - from torch._inductor.codecache import (FxGraphCache, - compiled_fx_graph_hash) - original_load = FxGraphCache.load - - def hijack_load(*args, **kwargs): - inductor_compiled_graph = original_load(*args, **kwargs) - inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa - return inductor_compiled_graph - - def hijack_compiled_fx_graph_hash(*args, **kwargs): - out = compiled_fx_graph_hash(*args, **kwargs) - inductor_artifact.hash_str = out[0] - return out - - def _check_can_cache(*args, **kwargs): - # no error means it can be cached. - # Inductor refuses to cache the graph outside of Dynamo - # tracing context, and also disables caching for graphs - # with high-order ops. - # For vLLM, in either case, we want to cache the graph. 
- # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa - return - - def _get_shape_env() -> AlwaysHitShapeEnv: - return AlwaysHitShapeEnv() - - with ExitStack() as stack: - if not cache_data.disabled: - # compilation cache is enabled, patch several functions - - # hijack to get the compiled graph itself - stack.enter_context( - patch("torch._inductor.codecache.FxGraphCache.load", - hijack_load)) - - # for hijacking the hash of the compiled graph - stack.enter_context( - patch("torch._inductor.codecache.compiled_fx_graph_hash", - hijack_compiled_fx_graph_hash)) - - # for providing a dummy shape environment - stack.enter_context( - patch( - "torch._inductor.codecache.FxGraphCache._get_shape_env", - _get_shape_env)) - - # for forcing the graph to be cached - stack.enter_context( - patch( - "torch._inductor.codecache.FxGraphCache._check_can_cache", - _check_can_cache)) - - compiled_graph = compile_fx(graph, - example_inputs, - config_patches=current_config) - # store the inductor_artifact in the cache - cache_data[(runtime_shape, graph_index)] = inductor_artifact + "Directly load the %s-th graph for shape %s from %s via " + "handle %s", graph_index, str(runtime_shape), self.compiler.name, + handle) + return compiled_graph + + def compile(self, + graph: fx.GraphModule, + example_inputs, + additional_inductor_config, + compilation_config: CompilationConfig, + graph_index: int = 0, + num_graphs: int = 1, + runtime_shape: Optional[int] = None) -> Any: if graph_index == 0: - # adds some info logging for the first graph - logger.info("Cache the graph of shape %s for later use", - str(runtime_shape)) - logger.debug( - "store the %s-th graph for shape %s via hash %s from file %s", - graph_index, str(runtime_shape), inductor_artifact.hash_str, - inductor_artifact.file_path) - # after compiling the last graph, record the end time - if graph_index == num_graphs - 1: - now = time.time() - elapsed = now - compilation_start_time - compilation_config.compilation_time += elapsed - if runtime_shape is None: - logger.info("Compiling a graph for general shape takes %.2f s", - elapsed) - else: - logger.info("Compiling a graph for shape %s takes %.2f s", - runtime_shape, elapsed) + # before compiling the first graph, record the start time + global compilation_start_time + compilation_start_time = time.time() + + compilation_counter.num_backend_compilations += 1 + + compiled_graph = None + + # try to load from the cache + compiled_graph = self.load(graph, example_inputs, graph_index, + runtime_shape) + if compiled_graph is not None: + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Directly load the compiled graph for shape %s " + "from the cache", str(runtime_shape)) # noqa + return compiled_graph + + # no compiler cached the graph, or the cache is disabled, + # we need to compile it + compiled_graph, handle = self.compiler.compile( + graph, example_inputs, additional_inductor_config, runtime_shape) + + assert compiled_graph is not None, "Failed to compile the graph" + + # store the artifact in the cache + if handle is not None: + self.cache[(runtime_shape, graph_index, + self.compiler.name)] = handle + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Cache the graph of shape %s for later use", + str(runtime_shape)) + logger.debug( + "store the %s-th graph for shape %s from %s via handle %s", + graph_index, str(runtime_shape), self.compiler.name, handle) + + # after 
compiling the last graph, record the end time + if graph_index == num_graphs - 1: + now = time.time() + elapsed = now - compilation_start_time + compilation_config.compilation_time += elapsed + if runtime_shape is None: + logger.info("Compiling a graph for general shape takes %.2f s", + elapsed) + else: + logger.info("Compiling a graph for shape %s takes %.2f s", + runtime_shape, elapsed) - return compiled_graph + return compiled_graph @dataclasses.dataclass @@ -436,16 +257,15 @@ def call_module(self, target: torch.fx.node.Target, i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] global compilation_start_time - compiled_graph_for_general_shape = wrap_inductor( + compiled_graph_for_general_shape = self.vllm_backend.\ + compiler_manager.compile( submod, args, self.compilation_config.inductor_compile_config, self.compilation_config, - self.vllm_backend, graph_index=index, num_graphs=len(self.compile_submod_names), - runtime_shape=None, - use_inductor=self.compilation_config.use_inductor) + runtime_shape=None) self.module.__dict__[target] = PiecewiseBackend( submod, self.vllm_config, self.graph_pool, index, @@ -483,7 +303,7 @@ class VllmBackend: post_grad_passes: Sequence[Callable] sym_tensor_indices: List[int] input_buffers: List[torch.Tensor] - inductor_hash_cache: InductorHashCache + compiler_manager: CompilerManager def __init__( self, @@ -507,6 +327,9 @@ def __init__( self.vllm_config = vllm_config self.compilation_config = vllm_config.compilation_config + self.compiler_manager: CompilerManager = CompilerManager( + self.compilation_config.use_inductor) + # `torch.compile` is JIT compiled, so we don't need to # do anything here @@ -533,9 +356,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # the cache dir will be the same so that we can reuse the compiled # graph. + factors = [] # 1. factors come from the vllm_config (it mainly summarizes how the # model is created) config_hash = vllm_config.compute_hash() + factors.append(config_hash) # 2. factors come from the code files that are traced by Dynamo ( # it mainly summarizes how the model is used in forward pass) @@ -553,10 +378,15 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: import hashlib code_hash = hashlib.md5( "\n".join(hash_content).encode()).hexdigest() + factors.append(code_hash) + + # 3. 
compiler hash + compiler_hash = self.compiler_manager.compute_hash(vllm_config) + factors.append(compiler_hash) + + # combine all factors to generate the cache dir + hash_key = hashlib.md5(str(factors).encode()).hexdigest()[:10] - # combine the two hashes to generate the cache dir - hash_key = hashlib.md5( - f"{config_hash}_{code_hash}".encode()).hexdigest()[:10] cache_dir = os.path.join( envs.VLLM_CACHE_ROOT, "torch_compile_cache", @@ -570,15 +400,16 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: cache_dir, f"rank_{vllm_config.parallel_config.rank}") self.compilation_config.local_cache_dir = local_cache_dir - disabled = envs.VLLM_DISABLE_COMPILE_CACHE - self.inductor_hash_cache: InductorHashCache = InductorHashCache( - local_cache_dir, disabled=disabled) - if disabled: + disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE + + if disable_cache: logger.info("vLLM's torch.compile cache is disabled.") else: logger.info("Using cache directory: %s for vLLM's torch.compile", local_cache_dir) + self.compiler_manager.initialize_cache(local_cache_dir, disable_cache) + # when dynamo calls the backend, it means the bytecode # transform and analysis are done compilation_counter.num_graphs_seen += 1 @@ -759,7 +590,7 @@ def check_for_ending_compilation(self): if self.is_last_graph and not self.to_be_compiled_sizes: # no specific sizes to compile # save the hash of the inductor graph for the next run - self.vllm_backend.inductor_hash_cache.save_to_file() + self.vllm_backend.compiler_manager.save_to_file() end_monitoring_torch_compile(self.vllm_config) def __call__(self, *args) -> Any: @@ -782,16 +613,14 @@ def __call__(self, *args) -> Any: entry.compiled = True self.to_be_compiled_sizes.remove(runtime_shape) # args are real arguments - entry.runnable = wrap_inductor( + entry.runnable = self.vllm_backend.compiler_manager.compile( self.graph, args, self.compilation_config.inductor_compile_config, self.compilation_config, - self.vllm_backend, graph_index=self.piecewise_compile_index, num_graphs=self.total_piecewise_compiles, - runtime_shape=runtime_shape, - use_inductor=self.compilation_config.use_inductor) + runtime_shape=runtime_shape) # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py new file mode 100644 index 00000000000..ac0544ad640 --- /dev/null +++ b/vllm/compilation/compiler_interface.py @@ -0,0 +1,340 @@ +# SPDX-License-Identifier: Apache-2.0 +import copy +import hashlib +import os +from contextlib import ExitStack +from typing import Any, Callable, Dict, List, Optional, Tuple +from unittest.mock import patch + +import torch +import torch._inductor.compile_fx +import torch.fx as fx + +from vllm.config import VllmConfig + + +class CompilerInterface: + """ + The interface for a compiler that can be used by vLLM. + """ + # The name of the compiler, e.g. inductor. + # This is a class-level attribute. + name: str + + def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + """ + when the vLLM process uses `cache_dir` as the cache directory, + the compiler should initialize itself with the cache directory, + e.g. by re-directing its own cache directory to a sub-directory. + """ + pass + + def compute_hash(self, vllm_config: VllmConfig) -> str: + """ + Gather all the relevant information from the VLLM config, + to compute a hash so that we can cache the compiled model. 
+ + See :meth:`VllmConfig.compute_hash` to check what information + is already considered by default. This function should only + consider the information that is specific to the compiler. + """ + return "" + + def compile( + self, + graph: fx.GraphModule, + example_inputs: List[Any], + compiler_config: Dict[str, Any], + runtime_shape: Optional[int] = None + ) -> Tuple[Optional[Callable], Optional[Any]]: + """ + Compile the graph with the given example inputs and compiler config, + with a runtime shape. If the `runtime_shape` is None, it means + the `example_inputs` have a dynamic shape. Otherwise, the + `runtime_shape` specifies the shape of the inputs. Right now we only + support one variable shape for all inputs, which is the batchsize + (number of tokens) during inference. + + Dynamo will make sure `graph(*example_inputs)` is valid. + + The function should return a compiled callable function, as well as + a handle that can be used to directly load the compiled function. + + The handle should be a plain Python object, preferably a string or a + file path for readability. + + If the compiler doesn't support caching, it should return None for the + handle. If the compiler fails to compile the graph, it should return + None for the compiled function as well. + """ + return None, None + + def load(self, + handle: Any, + graph: fx.GraphModule, + example_inputs: List[Any], + graph_index: int, + runtime_shape: Optional[int] = None) -> Callable: + """ + Load the compiled function from the handle. + Raises an error if the handle is invalid. + + The handle is the second return value of the `compile` function. + """ + raise NotImplementedError("caching is not supported") + + +class AlwaysHitShapeEnv: + """ + Why do we need this class: + + For normal `torch.compile` usage, every compilation will have + one Dynamo bytecode compilation and one Inductor compilation. + The Inductor compilation happens under the context of the + Dynamo bytecode compilation, and that context is used to + determine the dynamic shape information, etc. + + For our use case, we only run Dynamo bytecode compilation once, + and run Inductor compilation multiple times with different shapes + plus a general shape. The compilation for specific shapes happens + outside of the context of the Dynamo bytecode compilation. At that + time, we don't have shape environment to provide to Inductor, and + it will fail the Inductor code cache lookup. + + By providing a dummy shape environment that always hits, we can + make the Inductor code cache lookup always hit, and we can + compile the graph for different shapes as needed. + + The following dummy methods are obtained by trial-and-error + until it works. + """ + + def __init__(self) -> None: + self.guards: List[Any] = [] + + def evaluate_guards_expression(self, *args, **kwargs): + return True + + def get_pruned_guards(self, *args, **kwargs): + return [] + + def produce_guards_expression(self, *args, **kwargs): + return "" + + +class InductorAdaptor(CompilerInterface): + """ + The adaptor for the Inductor compiler, version 2.5 and 2.6. 
+ """ + name = "inductor" + + def compute_hash(self, vllm_config: VllmConfig) -> str: + factors: List[Any] = [] + # summarize system state + from torch._inductor.codecache import CacheBase + system_factors = CacheBase.get_system() + factors.append(system_factors) + + # summarize pytorch state + from torch._inductor.codecache import torch_key + torch_factors = torch_key() + factors.append(torch_factors) + hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] + return hash_str + + def initialize_cache(self, cache_dir: str, disable_cache: bool = False): + if disable_cache: + return + # redirect the cache directory to a sub-directory + # set flags so that Inductor and Triton store their cache + # in the cache_dir, then users only need to copy the cache_dir + # to another machine to reuse the cache. + inductor_cache = os.path.join(cache_dir, "inductor_cache") + os.makedirs(inductor_cache, exist_ok=True) + os.environ["TORCHINDUCTOR_CACHE_DIR"] = inductor_cache + triton_cache = os.path.join(cache_dir, "triton_cache") + os.makedirs(triton_cache, exist_ok=True) + os.environ["TRITON_CACHE_DIR"] = triton_cache + + def compile( + self, + graph: fx.GraphModule, + example_inputs: List[Any], + compiler_config: Dict[str, Any], + runtime_shape: Optional[int] = None + ) -> Tuple[Optional[Callable], Optional[Any]]: + from torch._inductor import config + current_config = config.get_config_copy() + from torch._inductor.compile_fx import compile_fx + + # disable remote cache + current_config["fx_graph_cache"] = True + current_config["fx_graph_remote_cache"] = False + + if compiler_config is not None: + current_config.update(compiler_config) + + if isinstance(runtime_shape, int): + # for a specific batchsize, tuning triton kernel parameters + # can be beneficial + current_config["max_autotune"] = True + current_config["coordinate_descent_tuning"] = True + + # inductor can inplace modify the graph, so we need to copy it + # see https://github.com/pytorch/pytorch/issues/138980 + graph = copy.deepcopy(graph) + + # it's the first time we compile this graph + # the assumption is that we don't have nested Inductor compilation. + # compiled_fx_graph_hash will only be called once, and we can hook + # it to get the hash of the compiled graph directly. 
+ + hash_str, file_path = None, None + from torch._inductor.codecache import (FxGraphCache, + compiled_fx_graph_hash) + + if torch.__version__.startswith("2.5"): + original_load = FxGraphCache.load + original_load_name = "torch._inductor.codecache.FxGraphCache.load" + + def hijack_load(*args, **kwargs): + inductor_compiled_graph = original_load(*args, **kwargs) + nonlocal file_path + file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + return inductor_compiled_graph + + hijacked_compile_fx_inner = torch._inductor.compile_fx.compile_fx_inner # noqa + elif torch.__version__ >= "2.6": + # function renamed in 2.6 + original_load_name = None + + def hijacked_compile_fx_inner(*args, **kwargs): + output = torch._inductor.compile_fx.compile_fx_inner( + *args, **kwargs) + nonlocal hash_str + inductor_compiled_graph = output + if inductor_compiled_graph is not None: + nonlocal file_path + file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + hash_str = inductor_compiled_graph._fx_graph_cache_key + return output + + def hijack_compiled_fx_graph_hash(*args, **kwargs): + out = compiled_fx_graph_hash(*args, **kwargs) + nonlocal hash_str + hash_str = out[0] + return out + + def _check_can_cache(*args, **kwargs): + # no error means it can be cached. + # Inductor refuses to cache the graph outside of Dynamo + # tracing context, and also disables caching for graphs + # with high-order ops. + # For vLLM, in either case, we want to cache the graph. + # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa + return + + def _get_shape_env() -> AlwaysHitShapeEnv: + return AlwaysHitShapeEnv() + + with ExitStack() as stack: + # hijack to get the compiled graph itself + if original_load_name is not None: + stack.enter_context(patch(original_load_name, hijack_load)) + + # for hijacking the hash of the compiled graph + stack.enter_context( + patch("torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash)) + + # for providing a dummy shape environment + stack.enter_context( + patch("torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env)) + + # for forcing the graph to be cached + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache)) + + compiled_graph = compile_fx( + graph, + example_inputs, + inner_compile=hijacked_compile_fx_inner, + config_patches=current_config) + + assert hash_str is not None, ( + "failed to get the hash of the compiled graph") + assert file_path is not None, ( + "failed to get the file path of the compiled graph") + return compiled_graph, (hash_str, file_path) + + def load(self, + handle: Any, + graph: fx.GraphModule, + example_inputs: List[Any], + graph_index: int, + runtime_shape: Optional[int] = None) -> Callable: + assert isinstance(handle, tuple) + assert isinstance(handle[0], str) + assert isinstance(handle[1], str) + hash_str = handle[0] + + from torch._inductor.codecache import FxGraphCache + with patch("torch._inductor.codecache.FxGraphCache._get_shape_env", + lambda *args, **kwargs: AlwaysHitShapeEnv()): + if torch.__version__.startswith("2.5"): + inductor_compiled_graph = FxGraphCache._lookup_graph( + hash_str, example_inputs, True, False) + assert inductor_compiled_graph is not None, ( + "Inductor cache lookup failed. Please remove" + f"the cache directory and try again." 
# noqa + ) + elif torch.__version__ >= "2.6": + from torch._inductor.output_code import ( + CompiledFxGraphConstantsWithGm) + constants = CompiledFxGraphConstantsWithGm(graph) + inductor_compiled_graph, _ = FxGraphCache._lookup_graph( + hash_str, example_inputs, True, None, constants) + assert inductor_compiled_graph is not None, ( + "Inductor cache lookup failed. Please remove" + f"the cache directory and try again." # noqa + ) + + # Inductor calling convention (function signature): + # f(list) -> tuple + # Dynamo calling convention (function signature): + # f(*args) -> Any + + # need to know if the graph returns a tuple + from torch._inductor.compile_fx import graph_returns_tuple + returns_tuple = graph_returns_tuple(graph) + + # this is the callable we return to Dynamo to run + def compiled_graph(*args): + # convert args to list + list_args = list(args) + graph_output = inductor_compiled_graph(list_args) + # unpack the tuple if needed + if returns_tuple: + return graph_output + else: + return graph_output[0] + + return compiled_graph + + +class EagerAdaptor(CompilerInterface): + name = "eager" + + def compile( + self, + graph: fx.GraphModule, + example_inputs: List[Any], + compiler_config: Dict[str, Any], + runtime_shape: Optional[int] = None + ) -> Tuple[Optional[Callable], Optional[Any]]: + # we don't need to compile the graph, just return the graph itself. + # It does not support caching, return None for the handle. + return graph, None diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index a6f11a3af4d..5be452593c6 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -13,7 +13,7 @@ class CompilationCounter: num_piecewise_graphs_seen: int = 0 # not including the splitting ops num_piecewise_capturable_graphs_seen: int = 0 - num_inductor_compilations: int = 0 + num_backend_compilations: int = 0 num_cudagraph_caputured: int = 0 def clone(self) -> "CompilationCounter": diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index be663946f4d..1fea927aac3 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -13,7 +13,6 @@ class InductorPass(ABC): """ General custom inductor pass interface. - TODO(torch==2.6) use torch._inductor.custom_graph_pass.CustomGraphPass """ @abstractmethod diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index c7387fb7c2d..52f8c3b1ec1 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -2,6 +2,7 @@ from typing import Any, Dict, List +import torch from torch import fx as fx from vllm.config import CompilationConfig @@ -15,7 +16,17 @@ logger = init_logger(__name__) -class PostGradPassManager: +class PlaceHolder: + pass + + +if torch.__version__ < "2.6": + Parent = PlaceHolder # type: ignore +else: + Parent = torch._inductor.custom_graph_pass.CustomGraphPass # type: ignore + + +class PostGradPassManager(Parent): """ The pass manager for post-grad passes. It handles configuration, adding custom passes, and running passes. @@ -55,6 +66,9 @@ def add(self, pass_: InductorPass): assert isinstance(pass_, InductorPass) self.passes.append(pass_) + def uuid(self): + return self.__getstate__() + def __getstate__(self) -> Dict[str, List[Any]]: """ Custom pickling for the pass manager, as some passes cannot be pickled. 
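The vllm_compile_cache.py file written by CompilerManager.save_to_file above is a pretty-printed Python literal rather than JSON, because its keys are (runtime_shape, graph_index, backend_name) tuples that JSON cannot represent. A minimal, self-contained sketch of that round-trip (save_cache and load_cache are illustrative helpers, not vLLM APIs):

```python
# Sketch of the (runtime_shape, graph_index, backend_name) -> handle cache
# round-trip; save_cache/load_cache are illustrative helpers, not vLLM APIs.
import ast
import os
import pprint
from typing import Any, Dict, Optional, Tuple

CacheKey = Tuple[Optional[int], int, str]


def save_cache(cache: Dict[CacheKey, Any], path: str) -> None:
    # pprint emits a readable Python literal; JSON would reject tuple keys.
    with open(path, "w") as f:
        f.write(pprint.pformat(cache, indent=4))


def load_cache(path: str) -> Dict[CacheKey, Any]:
    if not os.path.exists(path):
        return {}
    with open(path) as f:
        # ast.literal_eval parses Python literals only; safer than eval().
        return ast.literal_eval(f.read())


if __name__ == "__main__":
    cache = {(None, 0, "inductor"): ("fx_hash_general", "/tmp/compiled.py"),
             (8, 0, "inductor"): ("fx_hash_bs8", "/tmp/compiled_bs8.py")}
    save_cache(cache, "vllm_compile_cache_example.py")
    assert load_cache("vllm_compile_cache_example.py") == cache
```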
diff --git a/vllm/config.py b/vllm/config.py index 9ba49757612..5579d6936d1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3072,15 +3072,6 @@ def compute_hash(self) -> str: the final hidden states. """ factors: List[Any] = [] - # summarize system state - from torch._inductor.codecache import CacheBase - system_factors = CacheBase.get_system() - factors.append(system_factors) - - # summarize pytorch state - from torch._inductor.codecache import torch_key - torch_factors = torch_key() - factors.append(torch_factors) # summarize vllm config vllm_factors: List[Any] = [] From d187c0811e3f4758d3d4ffcab6487c09aa6e9af0 Mon Sep 17 00:00:00 2001 From: Jitse Klomp Date: Thu, 6 Feb 2025 18:17:55 +0100 Subject: [PATCH 0040/1240] [Doc] double quote cmake package in build.inc.md (#12840) Signed-off-by: Louis Ulmer --- docs/source/getting_started/installation/cpu/build.inc.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/source/getting_started/installation/cpu/build.inc.md index f8d1044a0d1..2a8173803c0 100644 --- a/docs/source/getting_started/installation/cpu/build.inc.md +++ b/docs/source/getting_started/installation/cpu/build.inc.md @@ -10,7 +10,7 @@ Second, install Python packages for vLLM CPU backend building: ```console pip install --upgrade pip -pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy +pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` From 6999f71ed09d6084e19d800363aa84207362ec21 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 7 Feb 2025 01:18:22 +0800 Subject: [PATCH 0041/1240] [Bugfix] Fix unsupported FA version check for Turing GPU (#12828) Signed-off-by: Louis Ulmer --- vllm/attention/backends/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 3c5028a66d5..e8a34434122 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -612,5 +612,5 @@ def flash_attn_version(): return fa_version VLLM_FLASH_ATTN_VERSION = flash_attn_version() -except ImportError: +except (ImportError, AssertionError): VLLM_FLASH_ATTN_VERSION = None From 1d744fc40d58d250db09df2bc83cafbe6534ad31 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Thu, 6 Feb 2025 23:02:51 +0530 Subject: [PATCH 0042/1240] [V1] LoRA Support (#10957) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath Signed-off-by: Louis Ulmer --- tests/lora/conftest.py | 17 +++ tests/lora/test_baichuan.py | 8 ++ tests/lora/test_chatglm3_tp.py | 13 ++ tests/lora/test_gemma.py | 8 ++ tests/lora/test_llama_tp.py | 12 ++ tests/lora/test_lora_bias_e2e.py | 11 ++ tests/lora/test_phi.py | 13 ++ tests/lora/test_quant_model.py | 8 ++ tests/v1/core/test_kv_cache_utils.py | 2 +- vllm/lora/layers.py | 8 +- .../model_executor/layers/logits_processor.py | 28 ++-- vllm/v1/core/kv_cache_utils.py | 101 ++++++++++---- vllm/v1/core/scheduler.py | 32 ++++- vllm/v1/worker/gpu_input_batch.py | 63 ++++++++- vllm/v1/worker/gpu_model_runner.py | 56 ++++++-- vllm/v1/worker/lora_model_runner_mixin.py | 129 ++++++++++++++++++ 16 files changed, 453 insertions(+), 56 deletions(-) create mode 100644 vllm/v1/worker/lora_model_runner_mixin.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 071cdbecc68..5ea66518b41 100644 --- a/tests/lora/conftest.py +++ 
b/tests/lora/conftest.py @@ -306,3 +306,20 @@ def get_model_patched(**kwargs): def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings): yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. model_runner.model) + + +@pytest.fixture(params=[True, False]) +def run_with_both_engines_lora(request, monkeypatch): + # Automatically runs tests twice, once with V1 and once without + use_v1 = request.param + # Tests decorated with `@skip_v1` are only run without v1 + skip_v1 = request.node.get_closest_marker("skip_v1") + + if use_v1: + if skip_v1: + pytest.skip("Skipping test on vllm V1") + monkeypatch.setenv('VLLM_USE_V1', '1') + else: + monkeypatch.setenv('VLLM_USE_V1', '0') + + yield diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 249f7619d62..d3992594804 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -42,6 +42,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def test_baichuan_lora(baichuan_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 0aa9fe7a949..ee09afe8677 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -2,6 +2,8 @@ from typing import List +import pytest + import vllm from tests.utils import fork_new_process_for_each_test from vllm.lora.request import LoRARequest @@ -47,6 +49,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + +@pytest.mark.skip_v1 @fork_new_process_for_each_test def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, @@ -66,6 +77,7 @@ def test_chatglm3_lora(chatglm3_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] +@pytest.mark.skip_v1 @multi_gpu_test(num_gpus=4) @fork_new_process_for_each_test def test_chatglm3_lora_tp4(chatglm3_lora_files): @@ -87,6 +99,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files): assert output2[i] == EXPECTED_LORA_OUTPUT[i] +@pytest.mark.skip_v1 @multi_gpu_test(num_gpus=4) @fork_new_process_for_each_test def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 8923aa2210a..a1b4c897c45 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -33,6 +33,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @pytest.mark.xfail(current_platform.is_rocm(), reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 39f779f400c..564818f23fd 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -2,6 +2,7 @@ from typing import List +import pytest import ray import vllm @@ -73,6 +74,14 
@@ def generate_and_test(llm, sql_lora_files): print("removing lora") +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @fork_new_process_for_each_test def test_llama_lora(sql_lora_files): @@ -85,6 +94,9 @@ def test_llama_lora(sql_lora_files): generate_and_test(llm, sql_lora_files) +# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks +# used by the engine yet. +@pytest.mark.skip_v1 @fork_new_process_for_each_test def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py index cbdd688311d..3a7b391692c 100644 --- a/tests/lora/test_lora_bias_e2e.py +++ b/tests/lora/test_lora_bias_e2e.py @@ -30,6 +30,17 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + +# Skipping for V1 for now as we are hitting, +# "Head size 80 is not supported by FlashAttention." error. +@pytest.mark.skip_v1 @pytest.mark.parametrize("lora_bias", [True]) @pytest.mark.parametrize("fully_sharded", [True, False]) def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool): diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 651c89ffce2..8999e0cf319 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -2,6 +2,8 @@ from typing import List +import pytest + import vllm from vllm.lora.request import LoRARequest @@ -48,6 +50,17 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + +# Skipping for V1 for now as we are hitting, +# "Head size 80 is not supported by FlashAttention." error. +@pytest.mark.skip_v1 def test_phi2_lora(phi2_lora_files): # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, # Otherwise, the lora-test will fail due to CUDA OOM. 
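Each LoRA test module touched by this patch re-exports the parametrized conftest fixture through an autouse v1 wrapper, so every test runs once per engine and @pytest.mark.skip_v1 opts a test out of the V1 run. A condensed, self-contained sketch of the same pattern (fixture name shortened here; the real fixture is run_with_both_engines_lora in tests/lora/conftest.py):

```python
# Minimal pytest sketch of the run-with-both-engines pattern used above.
# The fixture parametrizes each test over V0/V1 and honors a `skip_v1` marker.
import pytest


@pytest.fixture(params=[False, True], ids=["v0", "v1"])
def run_with_both_engines(request, monkeypatch):
    use_v1 = request.param
    if use_v1 and request.node.get_closest_marker("skip_v1"):
        pytest.skip("Skipping test on vllm V1")
    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
    yield


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # Autouse wrapper: applies the parametrization to every test in the module.
    pass


@pytest.mark.skip_v1
def test_runs_only_on_v0():
    assert True
```

With --strict-markers enabled, the skip_v1 marker would also need to be registered in the pytest configuration; the sketch omits that.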
diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 5702aa26bd9..7f687f563eb 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -70,6 +70,14 @@ def format_prompt_tuples(prompt): return generated_texts +@pytest.fixture(autouse=True) +def v1(run_with_both_engines_lora): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 60cf4384d3f..8df4cbe1be7 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -163,7 +163,7 @@ def test_generate_block_hash_extra_keys(): # Test with no overlap extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0) - assert extra_keys == () + assert extra_keys is None assert next_mm_idx == 1 # Test with multiple extra keys diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9f0297596cc..9826aeb9dc2 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -16,8 +16,7 @@ get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather, - tensor_model_parallel_all_reduce, - tensor_model_parallel_gather) + tensor_model_parallel_all_reduce) from vllm.distributed.utils import divide # yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -1043,7 +1042,10 @@ def _get_logits( logits = lm_head.linear_method.apply(lm_head, hidden_states) if embedding_bias is not None: logits += embedding_bias - logits = tensor_model_parallel_gather(logits) + + # Gather logits for TP + logits = self.base_layer._gather_logits(logits) + if logits is None: return None diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index cdc67ca83d4..0565c6e8be3 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -51,7 +51,6 @@ def __init__(self, # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. - parallel_config = get_current_vllm_config().parallel_config self.use_all_gather = current_platform.is_tpu() \ or envs.VLLM_USE_V1 \ @@ -88,6 +87,20 @@ def forward( return logits + def _gather_logits(self, logits: torch.Tensor) -> torch.Tensor: + """gather/all-gather the logits tensor across model parallel group.""" + if self.use_all_gather: + # Gather is not supported for some devices such as TPUs. + # Use all-gather instead. + # NOTE(woosuk): Here, the outputs of every device should not be None + # because XLA requires strict SPMD among all devices. Every device + # should execute the same operations after gathering the logits. + logits = tensor_model_parallel_all_gather(logits) + else: + # None may be returned for rank > 0 + logits = tensor_model_parallel_gather(logits) + return logits + def _get_logits( self, hidden_states: torch.Tensor, @@ -99,16 +112,9 @@ def _get_logits( hidden_states, bias=embedding_bias) - if self.use_all_gather: - # Gather is not supported for some devices such as TPUs. - # Use all-gather instead. - # NOTE(woosuk): Here, the outputs of every device should not be None - # because XLA requires strict SPMD among all devices. 
Every device - # should execute the same operations after gathering the logits. - logits = tensor_model_parallel_all_gather(logits) - else: - # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) + # Gather logits for TP + logits = self._gather_logits(logits) + # Remove paddings in vocab (if any). if logits is not None: logits = logits[..., :self.org_vocab_size] diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index e0976ba8577..6888f1a3e18 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -170,14 +170,28 @@ def get_all_free_blocks(self) -> List[KVCacheBlock]: return ret -def generate_block_hash_extra_keys( - request: Request, start_token_idx: int, end_token_idx: int, - start_mm_idx: int) -> Tuple[Optional[Tuple[Any, ...]], int]: - """Generate extra keys for the block hash. The extra keys can come from - the multi-modal inputs and request specific metadata (e.g., LoRA ID). - For multi-modal inputs, the extra keys are (mm_hash, start_offset) that - indicate a mm input contained in the block and its starting offset in - the block tokens. +def need_extra_keys(request: Request) -> bool: + """Check whether the blocks allocated to this request need extra hash keys. + + Args: + request (Request): The request. + + Returns: + bool: Whether blocks allocated to this request need extra hash keys. + """ + + # Multimodal requests need to include the MM hash. + # LoRA requests need to include the LoRA ID. + return bool(request.mm_positions) or (request.lora_request is not None) + + +def _gen_mm_extra_hash_keys(request: Request, start_token_idx: int, + end_token_idx: int, + start_mm_idx: int) -> Tuple[List[Any], int]: + """Generate extra keys related to MultiModal request for block hash + computation. For multi-modal inputs, the extra keys are + (mm_hash, start_offset) that indicate a mm input contained in the + block and its starting offset in the block tokens. Args: request: The request object. @@ -188,10 +202,11 @@ def generate_block_hash_extra_keys( Returns: A tuple of extra keys and the next multi-modal index. """ + extra_keys: List[Any] = [] mm_positions, mm_hashes = request.mm_positions, request.mm_hashes if not mm_positions: - return None, start_mm_idx + return extra_keys, start_mm_idx if mm_positions and len(mm_positions) != len(mm_hashes): raise ValueError( @@ -204,14 +219,13 @@ def generate_block_hash_extra_keys( # range. This usually happens in the late prefill phase and decoding phase. if mm_positions[-1]["offset"] + mm_positions[-1][ "length"] < start_token_idx: - return None, start_mm_idx + return extra_keys, start_mm_idx # Support start_mm_idx == -1 to indicate the last mm input. if start_mm_idx < 0: assert -start_mm_idx <= len(mm_positions) start_mm_idx = len(mm_positions) + start_mm_idx - extra_keys = [] curr_mm_idx = start_mm_idx while mm_positions and curr_mm_idx < len(mm_positions): assert mm_hashes[curr_mm_idx] is not None @@ -237,7 +251,50 @@ def generate_block_hash_extra_keys( else: # This block has not reached the current mm input. break - return tuple(extra_keys), curr_mm_idx + return extra_keys, curr_mm_idx + + +def _gen_lora_extra_hash_keys(request: Request) -> List[int]: + """Generate extra keys related to LoRA for block hash computation. + + Args: + request: The request object. + + Returns: + Return LoRA id of the request if it is a LoRA request. Return empty + list otherwise. 
+ """ + if not request.lora_request: + return [] + return [request.lora_request.lora_int_id] + + +def generate_block_hash_extra_keys( + request: Request, start_token_idx: int, end_token_idx: int, + start_mm_idx: int) -> Tuple[Optional[Tuple[Any, ...]], int]: + """Generate extra keys for the block hash. The extra keys can come from + the multi-modal inputs and request specific metadata (e.g., LoRA ID). + + Args: + request: The request object. + start_token_idx: The start token index of the block. + end_token_idx: The end token index of the block. + start_mm_idx: The start multi-modal index of the block. + + Returns: + A tuple of extra keys and the next multi-modal index. + """ + mm_extra_keys: List[Any] + mm_extra_keys, new_start_mm_idx = _gen_mm_extra_hash_keys( + request, start_token_idx, end_token_idx, start_mm_idx) + lora_extra_keys: List[int] = _gen_lora_extra_hash_keys(request) + + extra_keys: List[Any] = lora_extra_keys + mm_extra_keys + + if not extra_keys: + return None, new_start_mm_idx + + return tuple(extra_keys), new_start_mm_idx def hash_block_tokens( @@ -249,9 +306,6 @@ def hash_block_tokens( prefix caching. We use LRU cache for this function to avoid recomputing hash values for the same block contents. - TODO: Support arbitrary metadata so that we could support more - features such as LoRA adapter. - Args: parent_block_hash: The hash of the parent block. None if this is the first block. @@ -291,14 +345,9 @@ def hash_request_tokens(block_size: int, The list of computed hash values. """ token_ids = request.all_token_ids - mm_positions, mm_hashes = request.mm_positions, request.mm_hashes - if mm_positions and len(mm_positions) != len(mm_hashes): - raise ValueError( - "The number of multi-modal positions and hashes must match.") - # TODO: Extend this to support other features such as LoRA. - need_extra_keys = bool(mm_positions) - extra_keys = None + req_need_extra_keys = need_extra_keys(request) + req_extra_keys = None curr_mm_idx = 0 ret = [] @@ -310,13 +359,13 @@ def hash_request_tokens(block_size: int, if len(block_token_ids) < block_size: break - # Add extra keys if the block is a multi-modal block. - if need_extra_keys: - extra_keys, curr_mm_idx = generate_block_hash_extra_keys( + if req_need_extra_keys: + # MM and LoRA requests need extra keys for block-hash computation. + req_extra_keys, curr_mm_idx = generate_block_hash_extra_keys( request, start, end, curr_mm_idx) block_hash = hash_block_tokens(parent_block_hash_value, - block_token_ids, extra_keys) + block_token_ids, req_extra_keys) ret.append(block_hash) parent_block_hash_value = block_hash.hash_value return ret diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index fb5e83fe062..6c44fec6439 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -7,6 +7,7 @@ from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) @@ -35,8 +36,6 @@ def __init__( self.scheduler_config = scheduler_config self.cache_config = cache_config self.lora_config = lora_config - # TODO: Support LoRA. - assert lora_config is None, "V1 does not support LoRA yet." # Scheduling constraints. 
self.max_num_running_reqs = self.scheduler_config.max_num_seqs @@ -180,6 +179,14 @@ def schedule(self) -> "SchedulerOutput": self.encoder_cache_manager.allocate(request, i) encoder_budget = new_encoder_budget + # Record the LoRAs in scheduled_running_reqs + requested_loras: Set[int] = set() + if self.lora_config: + requested_loras = set( + req.lora_request.lora_int_id for req in scheduled_running_reqs + if req.lora_request and req.lora_request.lora_int_id > 0) + assert len(requested_loras) <= self.lora_config.max_loras + # Next, schedule the WAITING requests. if not preempted_reqs: while self.waiting and token_budget > 0: @@ -187,6 +194,23 @@ def schedule(self) -> "SchedulerOutput": break request = self.waiting[0] + + # Check that adding the request still respects the max_loras + # constraint. + if self.lora_config and request.lora_request: + req_lora_id = request.lora_request.lora_int_id + if len(requested_loras) == self.lora_config.max_loras and ( + req_lora_id not in requested_loras): + # Cannot schedule. + # TODO (varun): This means all the other requests in + # the WAITING queue will be blocked by this request, + # even if, + # 1. these other requests do not use LoRA, or, + # 2. these other requests use the already requested + # LoRAs. + # This is too conservative and could be optimized. + break + # Get already-cached tokens. computed_blocks, num_computed_tokens = \ self.kv_cache_manager.get_computed_blocks(request) @@ -234,6 +258,8 @@ def schedule(self) -> "SchedulerOutput": raise RuntimeError( f"Invalid request status: {request.status}") + if self.lora_config and request.lora_request: + requested_loras.add(request.lora_request.lora_int_id) req_to_new_block_ids[request.request_id] = [ b.block_id for b in computed_blocks + new_blocks ] @@ -568,6 +594,7 @@ class NewRequestData: sampling_params: SamplingParams block_ids: List[int] num_computed_tokens: int + lora_request: Optional[LoRARequest] @classmethod def from_request( @@ -586,6 +613,7 @@ def from_request( sampling_params=request.sampling_params, block_ids=block_ids, num_computed_tokens=num_computed_tokens, + lora_request=request.lora_request, ) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 39708f833fd..a31e8886561 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -3,11 +3,12 @@ # Datastructures defining an input batch from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple import numpy as np import torch +from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams, SamplingType from vllm.v1.sample.metadata import SamplingMetadata @@ -35,6 +36,8 @@ class CachedRequestState: mrope_positions: Optional[torch.Tensor] = None mrope_position_delta: Optional[int] = None + lora_request: Optional[LoRARequest] = None + @property def num_tokens(self) -> int: return len(self.prompt_token_ids) + len(self.output_token_ids) @@ -161,6 +164,12 @@ def __init__( ] self.prompt_token_ids: Optional[torch.Tensor] = None + # lora related + self.request_lora_mapping = np.zeros((self.max_num_reqs, ), + dtype=np.int32) + self.lora_id_to_request_ids: Dict[int, Set[str]] = {} + self.lora_id_to_lora_request: Dict[int, LoRARequest] = {} + # req_index -> generator # NOTE(woosuk): The indices of the requests that do not have their own # generator should not be included in the dictionary. 
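The admission rule the scheduler change above enforces can be summarized as follows; this helper is illustrative only, since the scheduler tracks requested_loras inline rather than calling a function like this.

from typing import Optional, Set

def can_schedule(req_lora_id: Optional[int], active_lora_ids: Set[int],
                 max_loras: int) -> bool:
    # A request without LoRA, or whose LoRA is already active, is always fine;
    # otherwise it needs a free LoRA slot in the running batch.
    if req_lora_id is None or req_lora_id in active_lora_ids:
        return True
    return len(active_lora_ids) < max_loras

active = {1, 2}
assert can_schedule(None, active, max_loras=2)
assert can_schedule(1, active, max_loras=2)
assert not can_schedule(3, active, max_loras=2)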
@@ -235,6 +244,19 @@ def add_request( if sampling_params.prompt_logprobs: self.prompt_logprob_reqs.add(req_id) + # Add request lora ID + if request.lora_request: + lora_id = request.lora_request.lora_int_id + if lora_id not in self.lora_id_to_request_ids: + self.lora_id_to_request_ids[lora_id] = set() + + self.request_lora_mapping[req_index] = lora_id + self.lora_id_to_request_ids[lora_id].add(request.req_id) + self.lora_id_to_lora_request[lora_id] = request.lora_request + else: + # No LoRA + self.request_lora_mapping[req_index] = 0 + def remove_request(self, req_id: str) -> Optional[int]: req_index = self.req_id_to_index.pop(req_id, None) if req_index is None: @@ -251,6 +273,16 @@ def remove_request(self, req_id: str) -> Optional[int]: self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) self.prompt_logprob_reqs.discard(req_id) + + # LoRA + lora_id = self.request_lora_mapping[req_index] + if lora_id != 0: + self.lora_id_to_request_ids[lora_id].discard(req_id) + if len(self.lora_id_to_request_ids[lora_id]) == 0: + self.lora_id_to_request_ids.pop(lora_id) + self.lora_id_to_lora_request.pop(lora_id) + self.request_lora_mapping[req_index] = 0 + return req_index def clear(self) -> None: @@ -266,6 +298,9 @@ def clear(self) -> None: self.generators.clear() self.num_logprobs.clear() self.prompt_logprob_reqs.clear() + self.request_lora_mapping.fill(0) + self.lora_id_to_lora_request.clear() + self.lora_id_to_request_ids.clear() def condense(self, empty_req_indices: List[int]) -> None: if self.num_reqs == 0: @@ -318,6 +353,9 @@ def condense(self, empty_req_indices: List[int]) -> None: if generator is not None: self.generators[empty_index] = generator + self.request_lora_mapping[empty_index] = self.request_lora_mapping[ + last_req_index] + # Decrement last_req_index since it is now empty. last_req_index -= 1 @@ -401,6 +439,29 @@ def _make_prompt_token_ids_tensor(self) -> torch.Tensor: return prompt_token_ids_cpu_tensor.to(device=self.device, non_blocking=True) + def make_lora_inputs( + self, num_scheduled_tokens: np.ndarray + ) -> Tuple[Tuple[int, ...], Tuple[int, ...], Set[LoRARequest]]: + """ + Given the num_scheduled_tokens for each request in the batch, return + datastructures used to activate the current LoRAs. + Returns: + 1. prompt_lora_mapping: A tuple of size self.num_reqs where, + prompt_lora_mapping[i] is the LoRA id to use for the ith prompt. + 2. token_lora_mapping: A tuple of size np.sum(num_scheduled_tokens) + where, token_lora_mapping[i] is the LoRA id to use for ith token. + 3. lora_requests: Set of relevant LoRA requests. 
+ """ + + req_lora_mapping = self.request_lora_mapping[:self.num_reqs] + prompt_lora_mapping = tuple(req_lora_mapping) + token_lora_mapping = tuple( + req_lora_mapping.repeat(num_scheduled_tokens)) + active_lora_requests: Set[LoRARequest] = set( + self.lora_id_to_lora_request.values()) + + return prompt_lora_mapping, token_lora_mapping, active_lora_requests + @property def num_reqs(self) -> int: return len(self.req_id_to_index) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ec6d04cd497..bfc9d1ca83f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -33,6 +33,7 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch +from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin if TYPE_CHECKING: from vllm.v1.core.scheduler import SchedulerOutput @@ -40,7 +41,7 @@ logger = init_logger(__name__) -class GPUModelRunner: +class GPUModelRunner(LoRAModelRunnerMixin): def __init__( self, @@ -279,6 +280,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: block_ids=new_req_data.block_ids, num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], + lora_request=new_req_data.lora_request, ) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) @@ -372,15 +374,16 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Get the number of scheduled tokens for each request. # TODO: The Python loop can be slow. Optimize. - num_scheduled_tokens = [] + num_scheduled_tokens_list: List[int] = [] max_num_scheduled_tokens = 0 for req_id in self.input_batch.req_ids[:num_reqs]: assert req_id is not None num_tokens = scheduler_output.num_scheduled_tokens[req_id] - num_scheduled_tokens.append(num_tokens) + num_scheduled_tokens_list.append(num_tokens) max_num_scheduled_tokens = max(max_num_scheduled_tokens, num_tokens) - num_scheduled_tokens = np.array(num_scheduled_tokens, dtype=np.int32) + num_scheduled_tokens: np.ndarray = np.array(num_scheduled_tokens_list, + dtype=np.int32) assert max_num_scheduled_tokens > 0 # Get request indices. @@ -565,6 +568,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): prefix_kv_lens=prefix_kv_lens, suffix_kv_lens=suffix_kv_lens, ) + + # Hot-Swap lora model + if self.lora_config: + self.set_active_loras(self.input_batch, num_scheduled_tokens) + # NOTE(woosuk): Due to chunked prefills, the batch may contain partial # requests. While we should not sample any token from these partial # requests, we do so for simplicity. We will ignore the sampled @@ -867,6 +875,12 @@ def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 self.model = get_model(vllm_config=self.vllm_config) + if self.lora_config: + self.model = self.load_lora_model(self.model, + self.model_config, + self.scheduler_config, + self.lora_config, + self.device) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", @@ -1005,14 +1019,32 @@ def profile_run(self) -> None: # Cache the dummy encoder outputs. self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) - # Trigger compilation for general shape. 
- hidden_states = self._dummy_run(self.max_num_tokens, dummy_kv_caches) - logits = self.model.compute_logits(hidden_states, None) - logits = logits[:self.max_num_tokens] - # TODO(woosuk): Consider the memory usage of the sampler. - torch.cuda.synchronize() - del hidden_states, logits - self.encoder_cache.clear() + # For profile, have maximum num_reqs and that collectively have + # maximum num_tokens. + num_reqs = self.scheduler_config.max_num_seqs + num_tokens = self.max_num_tokens + min_tokens_per_req: int = num_tokens // num_reqs + + num_scheduled_tokens_list: List[int] = [min_tokens_per_req] * num_reqs + num_scheduled_tokens_list[-1] += num_tokens % num_reqs + assert sum(num_scheduled_tokens_list) == num_tokens + assert len(num_scheduled_tokens_list) == num_reqs + + num_scheduled_tokens: np.ndarray = np.array(num_scheduled_tokens_list, + dtype=np.int32) + logit_indices = np.cumsum(num_scheduled_tokens) - 1 + + with self.maybe_profile_with_lora(self.lora_config, + num_scheduled_tokens): + # Trigger compilation for general shape. + hidden_states = self._dummy_run(self.max_num_tokens, + dummy_kv_caches) + hidden_states = hidden_states[logit_indices] + logits = self.model.compute_logits(hidden_states, None) + # TODO(woosuk): Consider the memory usage of the sampler. + torch.cuda.synchronize() + del hidden_states, logits + self.encoder_cache.clear() gc.collect() def capture_model(self) -> None: diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py new file mode 100644 index 00000000000..e7501ad2ea1 --- /dev/null +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Define LoRA functionality mixin for model runners. +""" + +from contextlib import contextmanager +from typing import Set, Tuple + +import numpy as np +import torch.nn as nn + +from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig +from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager +from vllm.model_executor.models import supports_lora, supports_multimodal +from vllm.v1.worker.gpu_input_batch import InputBatch + +logger = init_logger(__name__) + + +# Defined as a mixin for GPUModelRunner +class LoRAModelRunnerMixin: + + LORA_WARMUP_RANK = 8 + + def load_lora_model(self, model: nn.Module, model_config: ModelConfig, + scheduler_config: SchedulerConfig, + lora_config: LoRAConfig, device: str) -> nn.Module: + + assert supports_lora( + model), f"{model.__class__.__name__} does not support LoRA yet." + + if supports_multimodal(model): + logger.warning("Regarding multimodal models, vLLM currently " + "only supports adding LoRA to language model.") + + # It's necessary to distinguish between the max_position_embeddings + # of VLMs and LLMs. 
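The profiling split used above distributes the token budget as evenly as possible over the dummy requests, gives the remainder to the last one, and takes the last token of each request as its logit index. A toy version with made-up sizes:

import numpy as np

num_reqs, num_tokens = 4, 10                 # stand-ins for max_num_seqs
min_tokens_per_req = num_tokens // num_reqs  # and max_num_tokens

num_scheduled = [min_tokens_per_req] * num_reqs
num_scheduled[-1] += num_tokens % num_reqs   # remainder goes to the last request
assert sum(num_scheduled) == num_tokens      # [2, 2, 2, 4]

# Only the last token of each request needs logits during profiling.
logit_indices = np.cumsum(np.array(num_scheduled, dtype=np.int32)) - 1
print(logit_indices)                         # [1 3 5 9]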
+ if hasattr(model.config, "max_position_embeddings"): + max_pos_embeddings = model.config.max_position_embeddings + else: + max_pos_embeddings = ( + model.config.text_config.max_position_embeddings) + + # Add LoRA Manager to the Model Runner + self.lora_manager = LRUCacheWorkerLoRAManager( + scheduler_config.max_num_seqs, + scheduler_config.max_num_batched_tokens, + model_config.get_vocab_size(), + lora_config, + device, + model.embedding_modules, + model.embedding_padding_modules, + max_position_embeddings=max_pos_embeddings, + ) + return self.lora_manager.create_lora_manager(model) + + def _set_active_loras(self, prompt_lora_mapping: Tuple[int, ...], + token_lora_mapping: Tuple[int, ...], + lora_requests: Set[LoRARequest]) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + + # We dont make any distinction between prefills and decodes in the + # scheduler. To that effect, set is_prefill to True so we use the + # sgmv punica kernels always. + lora_mapping = LoRAMapping(token_lora_mapping, + prompt_lora_mapping, + is_prefill=True) + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + def set_active_loras(self, input_batch: InputBatch, + num_scheduled_tokens: np.ndarray) -> None: + + prompt_lora_mapping: Tuple[int, ...] # of size input_batch.num_reqs + token_lora_mapping: Tuple[int, + ...] # of size np.sum(num_scheduled_tokens) + lora_requests: Set[LoRARequest] + prompt_lora_mapping, token_lora_mapping, lora_requests = \ + input_batch.make_lora_inputs(num_scheduled_tokens) + return self._set_active_loras(prompt_lora_mapping, token_lora_mapping, + lora_requests) + + @contextmanager + def maybe_profile_with_lora(self, lora_config: LoRAConfig, + num_scheduled_tokens: np.ndarray): + if lora_config is None: + yield + else: + # __enter__ code + assert self.lora_manager is not None, "LoRA is not enabled" + + num_reqs = len(num_scheduled_tokens) + num_loras = lora_config.max_loras + + # Make prompt lora mapping + # Assign LoRA IDs cyclically to simulate a worst-case scenario. + prompt_lora_mapping = (np.arange(num_reqs, dtype=np.int32) % + num_loras) + 1 + + # Make token lora mapping + token_lora_mapping = np.repeat(prompt_lora_mapping, + num_scheduled_tokens) + + # Make dummy lora requests + lora_requests: Set[LoRARequest] = { + LoRARequest(lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path") + for lora_id in range(1, num_loras + 1) + } + + with self.lora_manager.dummy_lora_cache(): + # Add the dummy LoRAs here so _set_active_loras doesn't try to + # load from disk. 
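The worst-case assignment built in maybe_profile_with_lora above assigns LoRA ids 1..max_loras to the dummy requests round-robin so every slot is exercised, then expands that to a per-token mapping. A toy run with illustrative sizes:

import numpy as np

num_reqs, max_loras = 5, 2
num_scheduled_tokens = np.array([2, 1, 1, 2, 1], dtype=np.int32)

prompt_lora_mapping = (np.arange(num_reqs, dtype=np.int32) % max_loras) + 1
token_lora_mapping = np.repeat(prompt_lora_mapping, num_scheduled_tokens)

print(prompt_lora_mapping.tolist())   # [1, 2, 1, 2, 1]
print(token_lora_mapping.tolist())    # [1, 1, 2, 1, 2, 2, 1]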
+ for lr in lora_requests: + self.lora_manager.add_dummy_lora( + lr, rank=self.LORA_WARMUP_RANK) + + self._set_active_loras(tuple(prompt_lora_mapping), + tuple(token_lora_mapping), + lora_requests) + + yield + + # __exit__ code + self.lora_manager.remove_all_adapters() From ea95d5f5ab987f93eec67e0a354d43b223e414e5 Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Fri, 7 Feb 2025 07:22:42 +0800 Subject: [PATCH 0043/1240] Add Bamba Model (#10909) Signed-off-by: Yu Chin Fabian Lim Signed-off-by: Tyler Michael Smith Co-authored-by: Tyler Michael Smith Signed-off-by: Louis Ulmer --- tests/kernels/test_mamba_mixer2.py | 125 +++ tests/kernels/test_mamba_ssm_ssd.py | 304 +++++++ .../{test_jamba.py => test_hybrid.py} | 35 +- tests/models/registry.py | 1 + vllm/attention/backends/placeholder_attn.py | 140 ++-- .../layers/mamba/mamba_mixer2.py | 534 +++++++++++++ .../layers/mamba/ops/mamba_ssm.py | 2 +- .../layers/mamba/ops/ssd_bmm.py | 261 ++++++ .../layers/mamba/ops/ssd_chunk_scan.py | 615 ++++++++++++++ .../layers/mamba/ops/ssd_chunk_state.py | 750 ++++++++++++++++++ .../layers/mamba/ops/ssd_combined.py | 223 ++++++ .../layers/mamba/ops/ssd_state_passing.py | 207 +++++ vllm/model_executor/models/bamba.py | 592 ++++++++++++++ vllm/model_executor/models/jamba.py | 11 +- vllm/model_executor/models/mamba.py | 10 +- vllm/model_executor/models/mamba_cache.py | 7 +- vllm/model_executor/models/registry.py | 1 + 17 files changed, 3706 insertions(+), 112 deletions(-) create mode 100644 tests/kernels/test_mamba_mixer2.py create mode 100644 tests/kernels/test_mamba_ssm_ssd.py rename tests/models/decoder_only/language/{test_jamba.py => test_hybrid.py} (91%) create mode 100644 vllm/model_executor/layers/mamba/mamba_mixer2.py create mode 100644 vllm/model_executor/layers/mamba/ops/ssd_bmm.py create mode 100644 vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py create mode 100644 vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py create mode 100644 vllm/model_executor/layers/mamba/ops/ssd_combined.py create mode 100644 vllm/model_executor/layers/mamba/ops/ssd_state_passing.py create mode 100644 vllm/model_executor/models/bamba.py diff --git a/tests/kernels/test_mamba_mixer2.py b/tests/kernels/test_mamba_mixer2.py new file mode 100644 index 00000000000..8c441fcbe61 --- /dev/null +++ b/tests/kernels/test_mamba_mixer2.py @@ -0,0 +1,125 @@ +# SPDX-License-Identifier: Apache-2.0 + +import unittest +from typing import Tuple + +import pytest +import torch + +from tests.utils import multi_gpu_test +from vllm.distributed.parallel_state import (init_distributed_environment, + initialize_model_parallel) +from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated +from vllm.platforms import current_platform +from vllm.utils import update_environment_variables + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("batch_size", [8]) +@pytest.mark.parametrize("seq_len", [128]) +@pytest.mark.parametrize( + "hidden_size_n_groups", + [ + (64, 1), + (64, 2), + (64, 4), # hidden_size be divisible by num_gpus + (100, 5), # and n_groups must divide hidden_size + ]) +@pytest.mark.parametrize("dtype", [torch.float16]) +def test_mixer2_gated_norm_multi_gpu( + batch_size: int, + seq_len: int, + hidden_size_n_groups: Tuple[int, int], + dtype: torch.dtype, + device: str = 'cuda', +): + hidden_size, n_groups = hidden_size_n_groups + num_processes = 2 + + def run_torch_spawn(fn, nprocs): + # need to use torch.mp.spawn otherwise will have problems with + # torch.distributed and cuda + 
torch.multiprocessing.spawn(fn, + args=( + num_processes, + batch_size, + seq_len, + hidden_size, + n_groups, + dtype, + device, + ), + nprocs=nprocs) + + run_torch_spawn(mixer2_gated_norm_tensor_parallel, 2) + + +def mixer2_gated_norm_tensor_parallel( + local_rank: int, + world_size: int, + batch_size: int, + seq_len: int, + hidden_size: int, + n_groups: int, + dtype: torch.dtype, + device: str, +): + current_platform.seed_everything(0) + + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + torch.set_default_device(device) + torch.set_default_dtype(dtype) + + update_environment_variables({ + 'RANK': str(local_rank), + 'LOCAL_RANK': str(local_rank), + 'WORLD_SIZE': str(world_size), + 'MASTER_ADDR': 'localhost', + 'MASTER_PORT': '12345', + }) + + # initialize distributed + init_distributed_environment() + initialize_model_parallel(tensor_model_parallel_size=world_size) + + # create random weights an inputs + weight = torch.rand((hidden_size, ), dtype=dtype, device=device) + hidden_states = torch.randn(batch_size, seq_len, hidden_size) + gate_states = torch.randn(batch_size, seq_len, hidden_size) + + # create gated-norm with TP + mixer = Mixer2RMSNormGated( + full_hidden_size=hidden_size, + full_n_groups=n_groups, + ) + mixer.weight.weight_loader(mixer.weight, weight) # load + + # create gated-norm without TP to compute reference + # - utilize mock patching to disable TP when + with (unittest.mock.patch( + "vllm.model_executor.layers.mamba.mamba_mixer2." + "get_tensor_model_parallel_world_size", + return_value=1), + unittest.mock.patch( + "vllm.model_executor.layers.mamba.mamba_mixer2." + "get_tensor_model_parallel_rank", + return_value=0)): + mixer_single_gpu = Mixer2RMSNormGated( + full_hidden_size=hidden_size, + full_n_groups=n_groups, + ) + # assign weight to single-gpu mixer + mixer_single_gpu.weight.data = weight + + # generate and compare + N = hidden_size // world_size + output = mixer( + hidden_states[..., local_rank * N:(local_rank + 1) * N], + gate_states[..., local_rank * N:(local_rank + 1) * N], + ) + ref_output = mixer_single_gpu(hidden_states, gate_states) + torch.allclose(output, + ref_output[..., local_rank * N:(local_rank + 1) * N], + atol=1e-3, + rtol=1e-3) diff --git a/tests/kernels/test_mamba_ssm_ssd.py b/tests/kernels/test_mamba_ssm_ssd.py new file mode 100644 index 00000000000..882513116ed --- /dev/null +++ b/tests/kernels/test_mamba_ssm_ssd.py @@ -0,0 +1,304 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, Tuple + +import pytest +import torch +import torch.nn.functional as F +from einops import rearrange, repeat + +from vllm.model_executor.layers.mamba.ops.ssd_combined import ( + mamba_chunk_scan_combined) +from vllm.platforms import current_platform + +# Added by the IBM Team, 2024 + +# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/modules/ssd_minimal.py + + +# this is the segsum implementation taken from above +def segsum(x): + """Calculates segment sum.""" + T = x.size(-1) + x = repeat(x, "... d -> ... 
d e", e=T) + mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), + diagonal=-1) + x = x.masked_fill(~mask, 0) + x_segsum = torch.cumsum(x, dim=-2) + mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), + diagonal=0) + x_segsum = x_segsum.masked_fill(~mask, -torch.inf) + return x_segsum + + +def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None): + """ + Arguments: + X: (batch, length, n_heads, d_head) + A: (batch, length, n_heads) + B: (batch, length, n_heads, d_state) + C: (batch, length, n_heads, d_state) + Return: + Y: (batch, length, n_heads, d_head) + """ + assert X.dtype == A.dtype == B.dtype == C.dtype + assert X.shape[1] % block_len == 0 + + # Rearrange into blocks/chunks + X, A, B, C = (rearrange(x, "b (c l) ... -> b c l ...", l=block_len) + for x in (X, A, B, C)) + + A = rearrange(A, "b c l h -> b h c l") + A_cumsum = torch.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + L = torch.exp(segsum(A)) + Y_diag = torch.einsum("bclhn,bcshn,bhcls,bcshp->bclhp", C, B, L, X) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum) + states = torch.einsum("bclhn,bhcl,bclhp->bchpn", B, decay_states, X) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at + # chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if initial_states is None: + initial_states = torch.zeros_like(states[:, :1]) + states = torch.cat([initial_states, states], dim=1) + decay_chunk = torch.exp(segsum(F.pad(A_cumsum[:, :, :, -1], (1, 0)))) + new_states = torch.einsum("bhzc,bchpn->bzhpn", decay_chunk, states) + states, final_state = new_states[:, :-1], new_states[:, -1] + + # 4. 
Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = torch.exp(A_cumsum) + Y_off = torch.einsum('bclhn,bchpn,bhcl->bclhp', C, states, state_decay_out) + + # Add output of intra-chunk and inter-chunk terms + # (diagonal and off-diagonal blocks) + Y = rearrange(Y_diag + Y_off, "b c l h p -> b (c l) h p") + return Y, final_state + + +def generate_random_inputs(batch_size, + seqlen, + n_heads, + d_head, + itype, + device='cuda'): + + current_platform.seed_everything(0) + A = (-torch.exp(torch.rand(n_heads, dtype=itype, device=device))) + dt = F.softplus( + torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - + 4) + X = torch.randn((batch_size, seqlen, n_heads, d_head), + dtype=itype, + device=device) + B = torch.randn((batch_size, seqlen, n_heads, d_head), + dtype=itype, + device=device) + C = torch.randn((batch_size, seqlen, n_heads, d_head), + dtype=itype, + device=device) + + return A, dt, X, B, C + + +def generate_continous_batched_examples(example_lens_by_batch, + num_examples, + full_length, + last_taken, + exhausted, + n_heads, + d_head, + itype, + device='cuda'): + + # this function generates a random examples of certain length + # and then cut according to "example_lens_by_batch" and feed + # them in continuous batches to the kernels + + # generate the full-length example + A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads, + d_head, itype) + + Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), + A * dt, + B, + C, + block_len=full_length // 4) + + # internal function that outputs a cont batch of examples + # given a tuple of lengths for each example in the batch + # e.g., example_lens=(8, 4) means take 8 samples from first eg, + # 4 examples from second eg, etc + def get_continuous_batch(example_lens: Tuple[int, ...]): + + indices = [] + for i, x in enumerate(example_lens): + c = last_taken.get(i, 0) + indices.append((c, c + x)) + last_taken[i] = (c + x) % full_length + exhausted[i] = last_taken[i] == 0 + + return (torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices) + ]).unsqueeze(0) for x in (dt, X, B, C)) + + # internal function that maps "n" to the appropriate right boundary + # value when forming continuous batches from examples of length given + # by "full_length". 
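A few worked values for the end_boundary helper described here; the signature is adapted to take full_length explicitly so the example is self-contained, but the arithmetic is the same.

def end_boundary(n: int, full_length: int) -> int:
    return n - ((n - 1) // full_length) * full_length

assert end_boundary(3, full_length=8) == 3    # partial example
assert end_boundary(8, full_length=8) == 8    # exactly one full example
assert end_boundary(11, full_length=8) == 3   # wraps around: 11 % 8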
+ # - e.g., when n > full_length, returns n % full_length + # when n == full_length, returns full_length + def end_boundary(n: int): + return n - ((n - 1) // full_length) * full_length + + IND_E = None + for spec in example_lens_by_batch: + + # get the (maybe partial) example seen in this cont batch + dt2, X2, B2, C2 = get_continuous_batch(spec) + + # get the metadata + cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0) + sed_idx = torch.zeros(cu_seqlens[-1], + dtype=torch.int32, + device=cu_seqlens.device) + for i, (srt, end) in enumerate(zip( + cu_seqlens, + cu_seqlens[1:], + )): + sed_idx[srt:end] = i + + # for cont batch + if IND_E is None: + IND_S = [0 for _ in range(len(spec))] + else: + IND_S = [x % full_length for x in IND_E] + IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)] + + yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)], + cu_seqlens, sed_idx.unsqueeze(0), (A, dt2, X2, B2, C2)) + + +@pytest.mark.parametrize("itype", + [torch.float32, torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32]) +@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128]) +@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)]) +def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size, + itype): + + # this tests the kernels on a single example (no batching) + + # set seed + batch_size = 1 # batch_size + # ssd_minimal_discrete requires chunk_size divide seqlen + # - this is only required for generating the reference seqs, + # it is not an operational limitation. + seqlen, chunk_size = seq_len_chunk_size + + A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads, + d_head, itype) + + Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt, + B, C, chunk_size) + + Y, final_state = mamba_chunk_scan_combined(X, + dt, + A, + B, + C, + chunk_size, + D=None, + return_final_states=True) + + # just test the last in sequence + torch.allclose(Y[:, -1], Y_min[:, -1], atol=1e-3, rtol=1e-3) + + # just test the last head + # NOTE, in the kernel we always cast states to fp32 + torch.allclose(final_state[:, -1], + final_state_min[:, -1].to(torch.float32), + atol=1e-3, + rtol=1e-3) + + +@pytest.mark.parametrize("itype", [torch.float32, torch.float16]) +@pytest.mark.parametrize("n_heads", [4, 8, 13]) +@pytest.mark.parametrize("d_head", [5, 16, 21, 32]) +@pytest.mark.parametrize( + "seq_len_chunk_size_cases", + [ + + # small-ish chunk_size (8) + (64, 8, 2, [(64, 32), (64, 32)]), + (64, 8, 2, [(32, 32), (32, 32), (32, 32)]), + (64, 8, 2, [(8, 8), (8, 8), (8, 8)]), # chunk size boundary + (64, 8, 2, [(4, 4), (4, 4), (4, 4), + (4, 4)]), # chunk_size larger than cont batches + (64, 8, 5, [ + (64, 32, 16, 8, 8), + (8, 16, 32, 16, 8), + (8, 8, 16, 32, 16), + ]), # mode examples with varied lengths + + # odd chunk_size + (64, 29, 2, [(11, 4), (13, 23), (19, 22), + (21, 15)]), # irregular sizes + + # large-ish chunk_size (256) + (64, 256, 1, [(5, ), (1, ), (1, ), + (1, )]), # irregular sizes with small sequences + (64, 256, 2, [(5, 30), (1, 2), (1, 2), + (1, 2)]), # irregular sizes with small sequences + ]) +def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases, + itype): + + # this test with multiple examples in a continuous batch + # (i.e. 
chunked prefill) + + seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases + + # hold state during the cutting process so we know if an + # example has been exhausted and needs to cycle + last_taken: Dict = {} # map: eg -> pointer to last taken sample + exhausted: Dict = {} # map: eg -> boolean indicating example is exhausted + + states = None + for Y_min, cu_seqlens, sed_idx, (A, dt, X, B, + C) in generate_continous_batched_examples( + cases, num_examples, seqlen, + last_taken, exhausted, n_heads, + d_head, itype): + + Y, new_states = mamba_chunk_scan_combined( + X, + dt, + A, + B, + C, + chunk_size, + D=None, + cu_seqlens=cu_seqlens, + seq_idx=sed_idx, + return_varlen_states=True, + initial_states=states, + ) + + # just test the last in sequence + for i in range(num_examples): + + # just test one dim and dstate + Y_eg = Y[0, cu_seqlens[i]:cu_seqlens[i + 1], 0, 0] + Y_min_eg = Y_min[i][:, 0, 0] + torch.allclose(Y_eg, Y_min_eg, atol=1e-3, rtol=1e-3) + + # update states + states = new_states + for i, clear in exhausted.items(): + if clear: + states[i].fill_(0.) + exhausted[i] = False diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_hybrid.py similarity index 91% rename from tests/models/decoder_only/language/test_jamba.py rename to tests/models/decoder_only/language/test_hybrid.py index cc98f1d7b5c..a39b1192358 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_hybrid.py @@ -8,7 +8,8 @@ from ...utils import check_outputs_equal -MODELS = ["ai21labs/Jamba-tiny-dev"] +# This test is for the hybrid models +MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"] @pytest.mark.parametrize("model", MODELS) @@ -23,6 +24,10 @@ def test_models( max_tokens: int, ) -> None: + # numeric error produces different generation + if 'Bamba' in model: + example_prompts.pop(3) + with hf_runner( model, dtype=dtype, @@ -108,15 +113,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [10]) +@pytest.mark.parametrize("max_tokens", [7]) def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, model: str, dtype: str, max_tokens: int) -> None: # numeric error during prefill chucking produces different generation # compared to w/o prefill chunking for those examples, removed them for now - example_prompts.pop(7) - example_prompts.pop(2) - example_prompts.pop(1) + if 'Jamba' in model: + example_prompts.pop(7) + example_prompts.pop(2) + example_prompts.pop(1) + elif 'Bamba' in model: + example_prompts.pop(6) + example_prompts.pop(3) + example_prompts.pop(2) + dtype = "half" # use a different dtype for Bamba with hf_runner( model, @@ -145,7 +156,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [15]) def test_parallel_sampling( vllm_runner, @@ -249,17 +260,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( dtype: str, example_prompts, ) -> None: - # This test is for verifying that the Jamba inner state management doesn't + # This test is for verifying that the hybrid inner state management doesn't # collapse in case where the number of incoming requests and # finished_requests_ids is larger than the maximum 
mamba block capacity. - # This could generally happen due to the fact that Jamba does support + # This could generally happen due to the fact that hybrid does support # statelessness mechanism where it can cleanup new incoming requests in # a single step. try: with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model: vllm_model.generate_greedy([example_prompts[0]] * 100, 10) except ValueError: - pytest.fail("Jamba inner state wasn't cleaned up properly between" + pytest.fail("Hybrid inner state wasn't cleaned up properly between" "steps finished requests registered unnecessarily ") @@ -271,14 +282,14 @@ def test_state_cleanup( dtype: str, example_prompts, ) -> None: - # This test is for verifying that the Jamba state is cleaned up between + # This test is for verifying that the Hybrid state is cleaned up between # steps, If its not cleaned, an error would be expected. try: with vllm_runner(model, dtype=dtype) as vllm_model: for _ in range(10): vllm_model.generate_greedy([example_prompts[0]] * 100, 1) except ValueError: - pytest.fail("Jamba inner state wasn't cleaned up between states, " + pytest.fail("Hybrid inner state wasn't cleaned up between states, " "could be related to finished_requests_ids") @@ -324,7 +335,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str, @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [64]) -def test_jamba_distributed_produces_identical_generation( +def test_hybrid_distributed_produces_identical_generation( vllm_runner, model: str, dtype: str, max_tokens: int, example_prompts) -> None: diff --git a/tests/models/registry.py b/tests/models/registry.py index 20787fe008a..3fd94b89c8a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -102,6 +102,7 @@ def check_available_online( trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", trust_remote_code=True), + "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"), "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"), # ChatGLMModel supports multimodal "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 9f6e731afd1..f363ba0c1e3 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -2,6 +2,7 @@ from collections import defaultdict from dataclasses import dataclass +from itertools import accumulate from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type import torch @@ -15,6 +16,7 @@ if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) +from vllm.utils import async_tensor_h2d # Placeholder attention backend for models like Mamba and pooling models that # lack attention. @@ -77,43 +79,39 @@ class PlaceholderAttentionMetadata(AttentionMetadata): # seq_lens stored as a tensor. seq_lens_tensor: Optional[torch.Tensor] - # Maximum query length in the batch. - max_query_len: Optional[int] - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] - # Maximum sequence length among prefill batch. 0 if there are decoding # requests only. max_prefill_seq_len: int # Maximum sequence length among decode batch. 0 if there are prefill # requests only. max_decode_seq_len: int - # (batch_size + 1,). 
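For reference, a small standalone example of the cumulative start locations these fields hold, built on the host with itertools.accumulate as the reworked build() below does; the lengths are toy values.

from itertools import accumulate

query_lens = [4, 6]
seq_lens = [7, 9]

query_start_loc = list(accumulate(query_lens, initial=0))
seq_start_loc = list(accumulate(seq_lens, initial=0))

assert query_start_loc == [0, 4, 10]
assert seq_start_loc == [0, 7, 16]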
The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] # (batch_size,) A tensor of context lengths (tokens that are computed # so far). context_lens_tensor: Optional[torch.Tensor] - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - # Whether or not if cuda graph is enabled. # Cuda-graph is currently enabled for decoding only. # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. use_cuda_graph: bool + # Maximum query length in the batch. + max_query_len: Optional[int] + + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] + + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + query_start_loc: Optional[torch.Tensor] = None + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] = None + + # Placeholder. + block_tables: Optional[torch.Tensor] = None + _cached_prefill_metadata: Optional["PlaceholderAttentionMetadata"] = None _cached_decode_metadata: Optional["PlaceholderAttentionMetadata"] = None @@ -125,11 +123,17 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: if self._cached_prefill_metadata is not None: return self._cached_prefill_metadata - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - assert self.query_start_loc is not None - assert self.context_lens_tensor is not None - assert self.seq_start_loc is not None + # Compute some attn_metadata fields which default to None + query_start_loc = (None if self.query_start_loc is None else + self.query_start_loc[:self.num_prefills + 1]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) # Placeholders slot_mapping = torch.empty(0) @@ -143,15 +147,15 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: multi_modal_placeholder_index_maps=self. 
multi_modal_placeholder_index_maps, enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=self.seq_lens[:self.num_prefills], - seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, max_decode_query_len=0, max_query_len=self.max_query_len, max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, - query_start_loc=self.query_start_loc[:self.num_prefills + 1], - seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], - context_lens_tensor=self.context_lens_tensor[:self.num_prefills], + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, ) @@ -169,6 +173,8 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: # Placeholders slot_mapping = torch.empty(0) block_tables = torch.empty(0) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[self.num_prefills:]) self._cached_decode_metadata = PlaceholderAttentionMetadata( num_prefills=0, @@ -178,13 +184,16 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: multi_modal_placeholder_index_maps=None, enable_kv_scales_calculation=True, seq_lens=None, - seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], + seq_lens_tensor=seq_lens_tensor, max_decode_query_len=self.max_decode_query_len, max_query_len=None, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=None, - seq_start_loc=None, + query_start_loc=(self.query_start_loc[self.num_prefills:] - + self.query_start_loc[self.num_prefills]) + if self.query_start_loc is not None else None, + seq_start_loc=self.seq_start_loc[self.num_prefills:] + if self.seq_start_loc is not None else None, context_lens_tensor=None, block_tables=block_tables, use_cuda_graph=self.use_cuda_graph, @@ -235,8 +244,6 @@ def advance_step(self, assert self.context_lens_tensor is not None assert self.context_lens_tensor.shape == (num_queries, ) - assert self.block_tables is not None - # Update query lengths. Note that we update only queries and not seqs, # since tensors may be padded due to captured cuda graph batch size for i in range(num_queries): @@ -299,9 +306,6 @@ def _add_seq_group( self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) else: - assert query_len == 1, ( - "seq_len: {}, context_len: {}, query_len: {}".format( - seq_len, context_len, query_len)) self.num_decode_tokens += query_len self.curr_seq_lens.append(curr_seq_len) @@ -323,15 +327,6 @@ def build(self, seq_lens: List[int], query_lens: List[int], device = self.runner.device use_captured_graph = cuda_graph_pad_size != -1 - logits_soft_cap = getattr(self.runner.model_config.hf_config, - "attn_logit_softcapping", None) - if logits_soft_cap is not None: - raise ValueError( - "Please use Flashinfer backend for models with logits_soft_cap" - " (i.e., Gemma-2). Otherwise, the output might be wrong." 
- " Set Flashinfer backend by " - "export VLLM_ATTENTION_BACKEND=FLASHINFER.") - max_query_len = max(query_lens) decode_query_lens = query_lens[self.num_prefills:] if len(decode_query_lens) > 0: @@ -341,48 +336,37 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens + query_start_loc = list(accumulate(query_lens, initial=0)) + seq_start_loc = list(accumulate(seq_lens, initial=0)) if use_captured_graph: - num_decode_tokens = batch_size - + num_decode_tokens = batch_size - self.num_prefill_tokens assert max_query_len > 0, ("query_lens: {}".format(query_lens)) - context_lens_tensor = torch.tensor(self.context_lens, - dtype=torch.int, - device=device) - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=device) - query_lens_tensor = torch.tensor(query_lens, - dtype=torch.long, - device=device) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) - seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) + assert device is not None + context_lens_tensor = async_tensor_h2d(self.context_lens, torch.int, + device, self.runner.pin_memory) + seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, + self.runner.pin_memory) + query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, + device, + self.runner.pin_memory) + seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, + device, self.runner.pin_memory) + placeholder_index_maps = { modality: placeholder_map.index_map() for modality, placeholder_map in self.multimodal_placeholder_maps.items() } - torch.cumsum(seq_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) # Placeholders - slot_mapping = torch.empty(0) + slot_mapping_tensor = torch.empty(0) block_tables = torch.empty(0) return PlaceholderAttentionMetadata( num_prefills=self.num_prefills, - slot_mapping=slot_mapping, + slot_mapping=slot_mapping_tensor, multi_modal_placeholder_index_maps=placeholder_index_maps, enable_kv_scales_calculation=True, num_prefill_tokens=self.num_prefill_tokens, @@ -393,8 +377,8 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_decode_query_len=max_decode_query_len, max_prefill_seq_len=max_prefill_seq_len, max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, + query_start_loc=query_start_loc_tensor, + seq_start_loc=seq_start_loc_tensor, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=use_captured_graph, diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py new file mode 100644 index 00000000000..5fd12649102 --- /dev/null +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -0,0 +1,534 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.attention.backends.placeholder_attn import ( + PlaceholderAttentionMetadata) +from vllm.attention.backends.xformers import XFormersMetadata +from vllm.distributed import (divide, get_tensor_model_parallel_rank, + 
get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce) +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( + causal_conv1d_fn, causal_conv1d_update) +from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( + selective_state_update) +from vllm.model_executor.layers.mamba.ops.ssd_combined import ( + mamba_chunk_scan_combined) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import ( + LoaderFunction, composed_weight_loader, sharded_weight_loader) +from vllm.model_executor.models.mamba_cache import MambaCacheParams +from vllm.model_executor.utils import set_weight_attrs + +# Added by the IBM Team, 2024 + + +# Adapted from transformers.models.mamba2.modeling_mamba2.MambaRMSNormGated +@CustomOp.register("mixer2_gated_rms_norm") +class Mixer2RMSNormGated(CustomOp): + + def __init__(self, full_hidden_size, full_n_groups, eps=1e-6): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.full_hidden_size = full_hidden_size + self.group_size = full_hidden_size // full_n_groups + self.per_rank_hidden_size = full_hidden_size // self.tp_size + self.n_groups = full_hidden_size // self.group_size + + self.variance_epsilon = eps + self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size)) + set_weight_attrs(self.weight, + {"weight_loader": sharded_weight_loader(0)}) + assert self.full_hidden_size % self.tp_size== 0,\ + "Tensor parallel world size must divide hidden size." + + def forward_native( + self, + x: torch.Tensor, + gate: torch.Tensor, + ): + # Three tensor-parallel cases: + # 1. n_groups is 1 + # In this case we parallelize along the reduction dim. + # Each rank computes a local sum of squares followed by AllReduce + # 2. tp_size divides n_groups + # Each rank only reduces within its local group(s). + # No collective ops necessary. + # 3. The general case can be pretty complicated so we AllGather + # the input and then redundantly compute the RMSNorm. 
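A plain single-device reference for the gated, grouped RMSNorm that these three cases parallelize: gate with silu, then RMS-normalize independently within each group. This is an illustrative sketch, not the Mixer2RMSNormGated implementation.

import torch
import torch.nn.functional as F

def gated_group_rmsnorm(x, gate, weight, n_groups, eps=1e-6):
    # Apply the silu gate, then per-group RMS normalization.
    x = x * F.silu(gate.to(torch.float32))
    *lead, hidden = x.shape
    xg = x.view(*lead, n_groups, hidden // n_groups)
    xg = xg * torch.rsqrt(xg.pow(2).mean(-1, keepdim=True) + eps)
    return weight * xg.view(*lead, hidden)

x, gate = torch.randn(2, 4, 8), torch.randn(2, 4, 8)
out = gated_group_rmsnorm(x, gate, torch.ones(8), n_groups=2)
assert out.shape == x.shape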
+ input_dtype = x.dtype + x = x * nn.functional.silu(gate.to(torch.float32)) + + if self.n_groups == 1: + if self.tp_size > 1: + # Compute local sum and then reduce to obtain global sum + local_sums = x.pow(2).sum(dim=-1, keepdim=True) + global_sums = tensor_model_parallel_all_reduce(local_sums) + # Calculate the variance + count = self.tp_size * x.shape[-1] + variance = (global_sums / count) + + else: + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + else: + redundant_tp: bool = self.n_groups % self.tp_size != 0 + if redundant_tp: + # To handle the general case, redundantly apply the variance + x = tensor_model_parallel_all_gather(x, -1) + + *prefix_dims, hidden_dim = x.shape + group_count = hidden_dim // self.group_size + x_grouped = x.view(*prefix_dims, group_count, self.group_size) + variance = x_grouped.pow(2).mean(-1, keepdim=True) + x_grouped = x_grouped * torch.rsqrt(variance + + self.variance_epsilon) + x = x_grouped.view(*prefix_dims, hidden_dim) + + if redundant_tp: + start = self.per_rank_hidden_size * self.tp_rank + end = start + self.per_rank_hidden_size + x = x[..., start:end] + + return self.weight * x.to(input_dtype) + + def forward_cuda( + self, + x: torch.Tensor, + gate: torch.Tensor, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + + if self.tp_size > 1 or self.n_groups != 1: + return self.forward_native(x, gate) + + from vllm import _custom_ops as ops + + # cast x and gate to float32 before silu + out = torch.empty_like(x) + y = x * nn.functional.silu(gate.to(torch.float32)) + ops.rms_norm( + out, + y.to(x.dtype), + self.weight.data, + self.variance_epsilon, + ) + return out + + +def extra_groups_for_head_shards(ngroups: int, tp_size: int): + """Compute the increase in group numbers to account for + replication in order to accompany the head shards.""" + + # in the case ngoups % tp_size == 0, this will be zero + if ngroups % tp_size == 0: + return 0 + + return tp_size - ngroups % tp_size + + +def mamba_v2_sharded_weight_loader( + shard_spec: List[Tuple[int, int, float]], + tp_size: int, + tp_rank: int, +) -> LoaderFunction: + """Create a weight loader for mamba v2. This ensures that the projections + are correctly sharded so that they can be split into x, B, C. It also + ensures the the all the groups corresponding to a head shard is placed + together with it. + """ + + def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + + # - track boundary of (sharded) param, and loaded_weight, respectively + boundary, loaded_boundary = 0, 0 + + # - iterate over the shard specs + for full_dim, extra, ratio in shard_spec: + # - full dim is the model dim (before TP). + # - extra > 0, means there is expected overall increase + # of dimensions. This is so because of replication. + # - ratio is used map the tp_rank to the actual shard + # rank. This is useful when there is replication of + # groups to accompany head shards. + + # - size of the loaded shard + shard_size = full_dim // tp_size + + # - compute the rank into the loaded shard. + # - if there is replication, different TP shards will + # take from the same rank. + rank = tp_rank // ratio + + # - leftmost boundary index into loaded weight. + loaded_skip = rank * shard_size + loaded_start_idx = loaded_boundary + loaded_skip + + # - take these many dims from the loaded weight. 
+ take = min(shard_size, full_dim - extra - loaded_skip) + + # - always shard on dim 0 + # - the ignore is for a mundane mypy error as it does not + # seem to handle slices well. + # https://github.com/python/mypy/issues/2410 + param.data[ + boundary:(boundary + take), # type: ignore[misc] + ...] = loaded_weight[loaded_start_idx:( # type: ignore[misc] + loaded_start_idx + take)] # type: ignore[misc] + + # move indexing boundaries + boundary += shard_size + loaded_boundary += (full_dim - extra) + + return loader + + +# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +@CustomOp.register("mamba_mixer2") +class MambaMixer2(CustomOp): + """ + Compute ∆, A, B, C, and D the state space parameters and compute + the `contextualized_states`. A, D are input independent + (see Mamba paper [1] Section 3.5.2 "Interpretation of A" + for why A isn't selective) ∆, B, C are input-dependent + (this is a key difference between Mamba and the linear time + invariant S4, and is why Mamba is called + **selective** state spaces) + """ + + def __init__(self, + hidden_size: int, + ssm_state_size: int, + conv_kernel_size: int, + intermediate_size: int, + use_conv_bias: bool, + use_bias: bool, + n_groups: int = 1, + num_heads: int = 128, + head_dim: int = 64, + rms_norm_eps: float = 1e-5, + activation="silu", + chunk_size: int = 256, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + # For TP, the sharding plan is as follows: + # - for the conv modules, since + # conv_dim = intermediate_size * 2 * n_groups * ssm_state_size, + # we shard intermediate_size and n_groups + # - since intermediate_size = n_heads * head_dim, sharding on + # intermediate_size is achieved by sharding on n_heads. + # - IF, world_size divides groups, then sharding + # (n_groups / world_size, n_heads / world_size) + # also maintains the invariant n_heads % n_groups == 0 + # - HOWEVER IF, world_size DOES NOT divide groups, then we need + # to allocate extra space in the shard, such that groups + # may be replicated to follow the head shard. + self.tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() + + assert num_heads % self.tp_size == 0, \ + "Tensor parallel world size must divide num heads." + + self.ssm_state_size = ssm_state_size + self.activation = activation + + self.chunk_size = chunk_size + self.intermediate_size = intermediate_size + self.head_dim = head_dim + self.num_heads = num_heads + + self.n_groups = n_groups + if n_groups % self.tp_size != 0: + # - for TP we shard conv_dim by sharding on n_groups, + # - but if n_groups cannot divide tp_size, we need to + # extend some extra groups + self.n_groups = n_groups + extra_groups_for_head_shards( + n_groups, self.tp_size) + + self.conv_dim = (intermediate_size + + 2 * self.n_groups * ssm_state_size) + self.conv1d = ColumnParallelLinear( + input_size=conv_kernel_size, + output_size=self.conv_dim, + bias=use_conv_bias, + quant_config=None, + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. 
+ # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + self.in_proj = ColumnParallelLinear(input_size=hidden_size, + output_size=intermediate_size + + self.conv_dim + self.num_heads, + bias=use_bias, + quant_config=quant_config) + + # - because in_proj is a concatenation of 3 weights, we + # need to interleave them before sharding + # - use the custom weight loader mamba_v2_sharded_weight_loader + # for conv1d.bias, covn1d.weight and in_proj.weight + # - need to set these settings, to assign the groups to the head shards + group_shard_settings = ( + self.n_groups * self.ssm_state_size, # expected model size + (self.n_groups - n_groups) * + self.ssm_state_size, # extra dims assigned + self.num_heads // + n_groups, # ratio for mapping back to original group + ) + intermediate_settings = (intermediate_size, 0, 1) + head_setings = (self.num_heads, 0, 1) + + # - the weight already has a "weight_loader" attribute + # which set_weight_attrs will raise if we do not + # delete before trying to override it + # - ditto for the otther two weights below + delattr(self.conv1d.bias, "weight_loader") + set_weight_attrs( + self.conv1d.bias, { + "weight_loader": + mamba_v2_sharded_weight_loader( + [ + intermediate_settings, + group_shard_settings, + group_shard_settings, + ], + self.tp_size, + tp_rank, + ) + }) + + delattr(self.conv1d.weight, "weight_loader") + set_weight_attrs( + self.conv1d.weight, { + "weight_loader": + mamba_v2_sharded_weight_loader([ + intermediate_settings, + group_shard_settings, + group_shard_settings, + ], self.tp_size, tp_rank) + }) + + delattr(self.in_proj.weight, "weight_loader") + set_weight_attrs( + self.in_proj.weight, + { + "weight_loader": + mamba_v2_sharded_weight_loader( + [ + intermediate_settings, # for gate + intermediate_settings, + group_shard_settings, + group_shard_settings, + head_setings, # for dt + ], + self.tp_size, + tp_rank) + }) + + # - these are TPed by heads to reduce the size of the + # temporal shape + self.A = nn.Parameter( + torch.empty( + divide(num_heads, self.tp_size), + dtype=torch.float32, + )) + self.D = nn.Parameter(torch.ones(num_heads // self.tp_size)) + self.dt_bias = nn.Parameter(torch.ones(num_heads // self.tp_size)) + + set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) + a_weight_loader = composed_weight_loader( + sharded_weight_loader(0), lambda x: -torch.exp(x.float())) + set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) + set_weight_attrs(self.dt_bias, + {"weight_loader": sharded_weight_loader(0)}) + + self.out_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=use_bias, + input_is_parallel=True, + quant_config=quant_config) + + self.norm = Mixer2RMSNormGated(intermediate_size, + n_groups, + eps=rms_norm_eps) + + def forward_native(self, hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + conv_state: torch.Tensor, ssm_state: torch.Tensor): + pass + + def forward_cuda( + self, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + mamba_cache_params: MambaCacheParams, + sequence_idx: Optional[torch.Tensor] = None, + ): + + seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + + # detect if there are prefills + has_prefill = attn_metadata.num_prefills > 0 + + # - also need flags to indicate if there are initial states + # - currently we really only support the 
FlashAttention backend + has_initial_states = None + if (isinstance(attn_metadata, + (FlashAttentionMetadata, XFormersMetadata, + PlaceholderAttentionMetadata)) + and attn_metadata.context_lens_tensor is not None): + has_initial_states = attn_metadata.context_lens_tensor > 0 + + # 1. Gated MLP's linear projection + projected_states, _ = self.in_proj(hidden_states) + gate, hidden_states_B_C, dt = torch.split( + projected_states, + [ + self.intermediate_size // self.tp_size, + self.conv_dim // self.tp_size, + self.num_heads // self.tp_size, + ], + dim=-1, + ) + + # 2. Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), + self.conv1d.weight.size(2)) + + if has_prefill: + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + # - "cache_indices" updates the conv_state cache in positions + # pointed to by "mamba_cache_params.state_indices_tensor" + hidden_states_B_C = causal_conv1d_fn( + hidden_states_B_C.transpose(0, 1), + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=mamba_cache_params.conv_state, + has_initial_state=has_initial_states, + cache_indices=mamba_cache_params.state_indices_tensor, + query_start_loc=attn_metadata.query_start_loc).transpose( + 0, 1)[:seq_len] + + # TODO: Why is this needed? + hidden_states_B_C = hidden_states_B_C.contiguous() + else: + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + mamba_cache_params.conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=mamba_cache_params.state_indices_tensor) + + # - get hidden_states, B and C after depthwise convolution. + hidden_states, B, C = torch.split( + hidden_states_B_C, + [ + self.intermediate_size // self.tp_size, + groups_time_state_size // self.tp_size, + groups_time_state_size // self.tp_size, + ], + dim=-1, + ) + + # 3. 
State Space Model sequence transformation + if has_prefill: + + initial_states = None + if has_initial_states is not None and any(has_initial_states): + for idx in mamba_cache_params.state_indices_tensor[ + ~has_initial_states]: + mamba_cache_params.ssm_state[idx].zero_() + initial_states = mamba_cache_params.ssm_state[ + mamba_cache_params.state_indices_tensor] + + scan_output, varlen_state = mamba_chunk_scan_combined( + hidden_states.view(1, seq_len, self.num_heads // self.tp_size, + self.head_dim), + dt.unsqueeze(0), + self.A, + B.view(1, seq_len, self.n_groups // self.tp_size, -1), + C.view(1, seq_len, self.n_groups // self.tp_size, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + dt_bias=self.dt_bias, + seq_idx=sequence_idx, + cu_seqlens=attn_metadata.query_start_loc, + initial_states=initial_states, + return_varlen_states=True, + return_final_states=False, + dt_softplus=True, + dt_limit=(0.0, float("inf")), + ) + + # update ssm states + # - varlen state is a (batch, nheads, headdim, dstate) tensor + for i, idx in enumerate(mamba_cache_params.state_indices_tensor): + mamba_cache_params.ssm_state[idx].copy_(varlen_state[i]) + + # - reshape + hidden_states = scan_output.view(seq_len, -1) + else: + + n_groups = self.n_groups // self.tp_size + A = self.A[:, None, ...][:, :, None].expand( + -1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(-1, n_groups, B.shape[1] // n_groups) + C = C.view(-1, n_groups, C.shape[1] // n_groups) + hidden_states_reshaped = hidden_states.view( + -1, self.num_heads // self.tp_size, self.head_dim) + + # - the hidden is reshaped into number of current batches + # - in this case there is no more prefill, so the batches gen + # 1 token at a time + # - thus hidden will be (bs, num_heads, head_dim) + # - mamba_cache_params.ssm_state's slots will be selected + # using "mamba_cache_params.state_indices_tensor", just as + # above in the prefill case + + hidden_states = selective_state_update( + mamba_cache_params.ssm_state, + hidden_states_reshaped, + dt, + A, + B, + C, + D, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + state_batch_indices=mamba_cache_params.state_indices_tensor, + ) + hidden_states = hidden_states.view( + -1, (self.num_heads // self.tp_size) * self.head_dim) + + # # 4. gated MLP + hidden_states = self.norm(hidden_states, gate) + + # # 5. Final linear projection + out, _ = self.out_proj(hidden_states) + return out diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 3c35f1ac0dc..b31b980fbe8 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # Copyright (c) 2024, Tri Dao, Albert Gu. -# Adapted from https://github.com/state-spaces/mamba/blob/main/mamba_ssm/ops/triton/selective_state_update.py +# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/selective_state_update.py import torch import triton diff --git a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py new file mode 100644 index 00000000000..388a6332721 --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py @@ -0,0 +1,261 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) 2024, Tri Dao, Albert Gu. 
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_bmm.py + +# ruff: noqa: E501,SIM102 + +import math + +import torch +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 32, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + ], + key=['chunk_size', 'K', 'IS_CAUSAL'], +) +@triton.jit +def _bmm_chunk_fwd_kernel( + # Pointers to matrices + a_ptr, + b_ptr, + out_ptr, + seq_idx_ptr, + # Matrix dimensions + seqlen, + chunk_size, + K, + ngroups, + stride_a_batch, + stride_a_seqlen, + stride_a_head, + stride_ak, + stride_b_batch, + stride_b_seqlen, + stride_b_head, + stride_bk, + stride_out_batch, + stride_out_chunk, + stride_out_head, + stride_outm, + stride_outn, + stride_seq_idx_batch, + stride_seq_idx_seqlen, + # Meta-parameters + IS_CAUSAL: tl.constexpr, + dot_dtype: tl.constexpr, + HAS_SEQ_IDX: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + pid_b = tl.program_id(axis=1) + pid_ch = tl.program_id(axis=2).to(tl.int64) + pid_c = pid_ch // ngroups + pid_h = pid_ch - pid_c * ngroups + num_pid_n = tl.cdiv(chunk_size, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + if IS_CAUSAL: + if pid_n * BLOCK_SIZE_N >= (pid_m + 1) * BLOCK_SIZE_M: + return + a_ptr += pid_b * stride_a_batch + pid_c * chunk_size * stride_a_seqlen + pid_h * stride_a_head + b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + pid_h * stride_b_head + if HAS_SEQ_IDX: + seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_m[:, None] * stride_a_seqlen + + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + + offs_n[None, :] * stride_b_seqlen) + chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + a = tl.load(a_ptrs, + mask=(offs_m[:, None] < chunk_size_limit) & + (offs_k[None, :] < K - k * BLOCK_SIZE_K), + other=0.0).to(dot_dtype) + b = tl.load(b_ptrs, + mask=(offs_k[:, None] < K - k * BLOCK_SIZE_K) & + (offs_n[None, :] < chunk_size_limit), + other=0.0).to(dot_dtype) + acc += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K * 
stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + if HAS_SEQ_IDX: + chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size) + seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, + mask=offs_m < chunk_size_limit, + other=-1) + seq_idx_n = tl.load(seq_idx_ptr + offs_n * stride_seq_idx_seqlen, + mask=offs_n < chunk_size_limit, + other=-2) + acc = tl.where(seq_idx_m[:, None] == seq_idx_n[None, :], acc, 0.0) + out = acc.to(out_ptr.dtype.element_ty) + + out_ptr += pid_b * stride_out_batch + pid_c * stride_out_chunk + pid_h * stride_out_head + out_ptrs = out_ptr + (stride_outm * offs_m[:, None] + + offs_n[None, :] * stride_outn) + tl.store(out_ptrs, + out, + mask=(offs_m[:, None] < chunk_size) & + (offs_n[None, :] < chunk_size)) + + +def _bmm_chunk_fwd(a, + b, + chunk_size, + seq_idx=None, + causal=False, + output_dtype=None): + """ + Argument: + a: (batch, seqlen, k) or (batch, seqlen, ngroups, k) + b: (batch, seqlen, k) or (batch, seqlen, ngroups, k) + seq_idx: (batch, seqlen) or None. out[i, j] for seq_idx[i] != seq_idx[j] will be zeroed out. + causal: if True, then out[i, j] for i > j will be arbitrary, only out[i, j] for i <= j are + guaranteed to be correct. + Return: + out: (batch, nchunks, chunk_size, chunk_size) or (batch, nchunks, ngroups, chunk_size, chunk_size) + """ + # Check constraints. + has_groups = a.dim() == 4 + if not has_groups: + batch, seqlen, k = a.shape + else: + batch, seqlen, ngroups, k = a.shape + assert b.shape == a.shape + if seq_idx is not None: + assert seq_idx.shape == (batch, seqlen) + if a.stride(-1) != 1 and a.stride(1) != 1: + a = a.contiguous() + if b.stride(-1) != 1 and b.stride(1) != 1: + b = b.contiguous() + nchunks = math.ceil(seqlen / chunk_size) + # Allocates output. + out_dtype = a.dtype if output_dtype is None else output_dtype + out = torch.empty( + (batch, nchunks, chunk_size, chunk_size) if not has_groups else + (batch, nchunks, ngroups, chunk_size, chunk_size), + device=a.device, + dtype=out_dtype) + dot_dtype = (tl.bfloat16 + if a.dtype == torch.bfloat16 or b.dtype == torch.bfloat16 else + (tl.float16 if a.dtype == torch.float16 + or b.dtype == torch.float16 else tl.float32)) + grid = lambda META: (triton.cdiv( + chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv( + chunk_size, META['BLOCK_SIZE_N']), batch, nchunks + if not has_groups else nchunks * ngroups) + with torch.cuda.device(a.device.index): + _bmm_chunk_fwd_kernel[grid]( + a, + b, + out, + seq_idx, + seqlen, + chunk_size, + k, + ngroups if has_groups else 1, + a.stride(0), + a.stride(1), + 0 if not has_groups else a.stride(2), + a.stride(-1), + b.stride(0), + b.stride(1), + 0 if not has_groups else b.stride(2), + b.stride(-1), + out.stride(0), + out.stride(1), + 0 if not has_groups else out.stride(2), + out.stride(-2), + out.stride(-1), + *((seq_idx.stride(0), + seq_idx.stride(1)) if seq_idx is not None else (0, 0)), + causal, + dot_dtype, + HAS_SEQ_IDX=seq_idx is not None, + ) + return out diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py new file mode 100644 index 00000000000..722fbd714ca --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -0,0 +1,615 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) 2024, Tri Dao, Albert Gu. 
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_scan.py + +# ruff: noqa: E501,SIM102 + +import math + +import torch +import triton +import triton.language as tl +from packaging import version + +TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0') + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 64 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 32, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + ], + key=['chunk_size', 'hdim', 'dstate', 'IS_CAUSAL'], +) +@triton.jit +def _chunk_scan_fwd_kernel( + # Pointers to matrices + cb_ptr, + x_ptr, + z_ptr, + out_ptr, + out_x_ptr, + dt_ptr, + dA_cumsum_ptr, + seq_idx_ptr, + C_ptr, + states_ptr, + D_ptr, + initstates_ptr, + chunk_indices_ptr, + chunk_offsets_ptr, + chunk_meta_num, + # Matrix dimensions + chunk_size, + hdim, + dstate, + batch, + seqlen, + nheads_ngroups_ratio, + # Strides + stride_cb_batch, + stride_cb_chunk, + stride_cb_head, + stride_cb_csize_m, + stride_cb_csize_k, + stride_x_batch, + stride_x_seqlen, + stride_x_head, + stride_x_hdim, + stride_z_batch, + stride_z_seqlen, + stride_z_head, + stride_z_hdim, + stride_out_batch, + stride_out_seqlen, + stride_out_head, + stride_out_hdim, + stride_dt_batch, + stride_dt_chunk, + stride_dt_head, + stride_dt_csize, + stride_dA_cs_batch, + stride_dA_cs_chunk, + stride_dA_cs_head, + stride_dA_cs_csize, + stride_seq_idx_batch, + stride_seq_idx_seqlen, + stride_C_batch, + stride_C_seqlen, + stride_C_head, + stride_C_dstate, + stride_states_batch, + stride_states_chunk, + stride_states_head, + stride_states_hdim, + stride_states_dstate, + stride_init_states_batch, + stride_init_states_head, + stride_init_states_hdim, + stride_init_states_dstate, + stride_D_head, + # Meta-parameters + IS_CAUSAL: tl.constexpr, + HAS_D: tl.constexpr, + D_HAS_HDIM: tl.constexpr, + HAS_Z: tl.constexpr, + HAS_SEQ_IDX: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + BLOCK_SIZE_DSTATE: tl.constexpr, + IS_TRITON_22: tl.constexpr, + HAS_INITSTATES: tl.constexpr, +): + pid_bc = tl.program_id(axis=1).to(tl.int64) + pid_c = pid_bc // batch + pid_b = pid_bc - pid_c * batch + if not HAS_INITSTATES: + c_idx = pid_c + c_off = 0 + else: + c_idx = tl.load(chunk_indices_ptr + pid_c, 
mask=pid_c > -1, other=0) + c_off = tl.load(chunk_offsets_ptr + pid_c, mask=pid_c > -1, other=0) + + pid_h = tl.program_id(axis=2) + num_pid_n = tl.cdiv(hdim, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + cb_ptr += pid_b * stride_cb_batch + c_idx * stride_cb_chunk + ( + pid_h // nheads_ngroups_ratio) * stride_cb_head + x_ptr += pid_b * stride_x_batch + c_idx * chunk_size * stride_x_seqlen + pid_h * stride_x_head + dt_ptr += pid_b * stride_dt_batch + c_idx * stride_dt_chunk + pid_h * stride_dt_head + dA_cumsum_ptr += pid_b * stride_dA_cs_batch + c_idx * stride_dA_cs_chunk + pid_h * stride_dA_cs_head + C_ptr += pid_b * stride_C_batch + c_idx * chunk_size * stride_C_seqlen + ( + pid_h // nheads_ngroups_ratio) * stride_C_head + + # M-block offsets and prev states + # - logic in next block may override these if there is an active offset + offs_m = pid_m * BLOCK_SIZE_M + c_off + tl.arange(0, BLOCK_SIZE_M) + prev_states_ptr = states_ptr + pid_b * stride_states_batch + c_idx * stride_states_chunk + pid_h * stride_states_head + prev_states_hdim = stride_states_hdim + prev_states_dstate = stride_states_dstate + + chunk_size_limit = min(chunk_size, seqlen - c_idx * chunk_size) + if HAS_SEQ_IDX: + seq_idx_ptr += pid_b * stride_seq_idx_batch + c_idx * chunk_size * stride_seq_idx_seqlen + + # - we only need seq_idx_prev to be aligned to chunk boundary + seq_idx_prev = tl.load(seq_idx_ptr - stride_seq_idx_seqlen, + mask=c_idx >= 1, + other=0) + + if HAS_INITSTATES: + # if there are init states, we only need seq_idx_m to point + # what is the current seq_idx + + # get current seq idx + if (pid_m * BLOCK_SIZE_M + c_off) < chunk_size_limit: + seq_idx_m = tl.load( + seq_idx_ptr + + (pid_m * BLOCK_SIZE_M + c_off) * stride_seq_idx_seqlen, ) + + # - recall that in ssd_state_passing, for the case c_off == 0 + # i.e., the very first sequence, we made states_ptr hold its initial state + # so this edge case is taken care of + if ((c_off == 0) and + (seq_idx_prev != seq_idx_m + ) # if a seq is changed exactly on boundary + or (c_off > 0) # implies a new example (pseudo chunk) + ): + + # - replace prev_states_ptr with init_states + prev_states_ptr = initstates_ptr + seq_idx_m * stride_init_states_batch + pid_h * stride_init_states_head + prev_states_hdim = stride_init_states_hdim # override strides + prev_states_dstate = stride_init_states_dstate + + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + dA_cs_m = tl.load(dA_cumsum_ptr + offs_m * stride_dA_cs_csize, + mask=offs_m < chunk_size, + other=0.0).to(tl.float32) + + # - handle chunk state limit + if HAS_INITSTATES: + + # have to split this if otherwise compilation will have problems + dA_cs_m_boundary = 0.0 + + # get the c_idx for the next (logica) chunk + c_idx_n = tl.load( + chunk_indices_ptr + (pid_c + 1), + mask=pid_c > -1 and (pid_c + 1) < chunk_meta_num, + other=-1 # to trigger different chunk + ) + + # - there are things to consider + # A. if c_off > 0 then we need to move the dA_cs boundary to ensure correct + # contribution of past states + # B. if c_off_n < chunk_size_limit, then we need to adjust this so as not to + # encroach into the next sequence, where c_off_n is the offset of the next + # (logical) chunk. + # An equivalent check for B is c_idx == c_idx_n, where there is repetition in + # (logical) chunk indices. 
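# (Concrete example of the chunk_indices / chunk_offsets consumed here; they
#  are produced by _seq_idx_to_chunk_indices_offsets further down in this
#  file. The numbers are illustrative only.) With chunk_size = 4 and a
#  continuous batch whose seq_idx is [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
#  sequence 1 starts inside physical chunk 1, so that chunk is split into two
#  logical (pseudo) chunks:
#    chunk_indices = [0, 1, 1, 2]   # physical chunk each logical chunk reads
#    chunk_offsets = [0, 0, 2, 0]   # c_off: where the logical chunk starts
#  The third logical chunk therefore has c_idx = 1 and c_off = 2, i.e. it
#  covers tokens 6..7, the first tokens of sequence 1.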
+ + if (c_idx == c_idx_n) or c_off > 0: + + # get the next offset + c_off_n = tl.load(chunk_offsets_ptr + (pid_c + 1), + mask=pid_c > -1 and (pid_c + 1) < chunk_meta_num, + other=chunk_size) + + # in this case, adjust down the chunk_size_limit + if c_idx == c_idx_n: + chunk_size_limit = min(c_off_n, chunk_size_limit) + + # get the cs at the offset boundary + # - c_off == 0 is a passthrough + dA_cs_m_boundary = tl.load( + dA_cumsum_ptr + + (pid_m * BLOCK_SIZE_M + c_off - 1) * stride_dA_cs_csize, + mask=(pid_m * BLOCK_SIZE_M + c_off - 1) > -1, + other=0.0).to(tl.float32) + + if HAS_SEQ_IDX: + # - handle seq idx when HAS_INITSTATES==False + if not HAS_INITSTATES: + seq_idx_m = tl.load(seq_idx_ptr + offs_m * stride_seq_idx_seqlen, + mask=offs_m < chunk_size_limit, + other=-1) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + # Without the if (pid_c > -1), with Triton 2.1.0, I get + # Assertion `!(srcMmaLayout && dstMmaLayout) && "Unexpected mma -> mm a layout conversion"' failed. + # With Triton 2.2.0, this works + if IS_TRITON_22 or c_idx > -1: + # Faster to just do 1 iteration with larger BLOCK_SIZE_K, up to block size 128 + offs_k_dstate = tl.arange( + 0, BLOCK_SIZE_DSTATE if BLOCK_SIZE_DSTATE <= 128 else BLOCK_SIZE_K) + C_ptrs = C_ptr + (offs_m[:, None] * stride_C_seqlen + + offs_k_dstate[None, :] * stride_C_dstate) + + prev_states_ptrs = prev_states_ptr + ( + offs_n[None, :] * prev_states_hdim + + offs_k_dstate[:, None] * prev_states_dstate) + if HAS_SEQ_IDX: + + if not HAS_INITSTATES: + # - this is for continuous batching where there is no init states + scale_m = tl.where(seq_idx_m == seq_idx_prev, tl.exp(dA_cs_m), + 0.0) + else: + # - if there is initstates, we will rely on prev_states, no zeroing + # required. + scale_m = tl.exp(dA_cs_m - dA_cs_m_boundary) + else: + scale_m = tl.exp(dA_cs_m) + if BLOCK_SIZE_DSTATE <= 128: + C = tl.load(C_ptrs, + mask=(offs_m[:, None] < chunk_size_limit) & + (offs_k_dstate[None, :] < dstate), + other=0.0) + + prev_states = tl.load(prev_states_ptrs, + mask=(offs_k_dstate[:, None] < dstate) & + (offs_n[None, :] < hdim), + other=0.0) + prev_states = prev_states.to(C_ptr.dtype.element_ty) + acc = tl.dot(C, prev_states) * scale_m[:, None] + else: + for k in range(0, dstate, BLOCK_SIZE_K): + C = tl.load(C_ptrs, + mask=(offs_m[:, None] < chunk_size_limit) & + (offs_k_dstate[None, :] < dstate - k), + other=0.0) + # C = (C * scale_m[:, None]).to(C_ptr.dtype.element_ty) + prev_states = tl.load( + prev_states_ptrs, + mask=(offs_k_dstate[:, None] < dstate - k) & + (offs_n[None, :] < hdim), + other=0.0) + prev_states = prev_states.to(C_ptr.dtype.element_ty) + acc += tl.dot(C, prev_states) + C_ptrs += BLOCK_SIZE_K + prev_states_ptrs += BLOCK_SIZE_K + acc *= scale_m[:, None] + + offs_k = tl.arange(0, BLOCK_SIZE_K) + c_off + cb_ptrs = cb_ptr + (offs_m[:, None] * stride_cb_csize_m + + offs_k[None, :] * stride_cb_csize_k) + x_ptrs = x_ptr + (offs_k[:, None] * stride_x_seqlen + + offs_n[None, :] * stride_x_hdim) + dt_ptrs = dt_ptr + offs_k * stride_dt_csize + dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize + K_MAX = chunk_size_limit if not IS_CAUSAL else min( + (pid_m + 1) * BLOCK_SIZE_M, chunk_size_limit) + for k in range(0, K_MAX, BLOCK_SIZE_K): + cb = tl.load(cb_ptrs, + mask=(offs_m[:, None] < chunk_size) & + (offs_k[None, :] < chunk_size - k), + other=0.0).to(tl.float32) + dA_cs_k = tl.load(dA_cumsum_ptrs, + mask=offs_k < chunk_size - k, + other=0.0).to(tl.float32) + # If there's seq_idx, we already set cb[i, j] = 0 for seq_idx[i] != 
seq_idx[j]. + # So we don't need masking wrt seq_idx here. + cb *= tl.exp(dA_cs_m[:, None] - dA_cs_k[None, :]) + dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size - k, + other=0.0).to(tl.float32) + cb *= dt_k + if IS_CAUSAL: + mask = offs_m[:, None] >= k + offs_k[None, :] + cb = tl.where(mask, cb, 0.0) + cb = cb.to(x_ptr.dtype.element_ty) + x = tl.load(x_ptrs, + mask=(offs_k[:, None] < chunk_size_limit - k) & + (offs_n[None, :] < hdim), + other=0.0) + acc += tl.dot(cb, x) + cb_ptrs += BLOCK_SIZE_K * stride_cb_csize_k + x_ptrs += BLOCK_SIZE_K * stride_x_seqlen + dt_ptrs += BLOCK_SIZE_K * stride_dt_csize + dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize + + offs_out_m = pid_m * BLOCK_SIZE_M + c_off + tl.arange(0, BLOCK_SIZE_M) + offs_out_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + + if HAS_D: + if D_HAS_HDIM: + D = tl.load(D_ptr + pid_h * stride_D_head + offs_n, + mask=offs_n < hdim, + other=0.0).to(tl.float32) + else: + D = tl.load(D_ptr + pid_h * stride_D_head).to(tl.float32) + x_residual = tl.load(x_ptr + (offs_m[:, None] * stride_x_seqlen + + offs_n[None, :] * stride_x_hdim), + mask=(offs_m[:, None] < chunk_size_limit) & + (offs_n[None, :] < hdim), + other=0.0).to(tl.float32) + acc += x_residual * D + + if HAS_Z: + out_x_ptr += pid_b * stride_out_batch + c_idx * chunk_size * stride_out_seqlen + pid_h * stride_out_head + out_x_ptrs = out_x_ptr + (stride_out_seqlen * offs_out_m[:, None] + + offs_out_n[None, :]) + tl.store(out_x_ptrs, + acc, + mask=(offs_out_m[:, None] < chunk_size_limit) & + (offs_out_n[None, :] < hdim)) + + z_ptr += pid_b * stride_z_batch + c_idx * chunk_size * stride_z_seqlen + pid_h * stride_z_head + z_ptrs = z_ptr + (stride_z_seqlen * offs_out_m[:, None] + + stride_z_hdim * offs_out_n[None, :]) + z = tl.load(z_ptrs, + mask=(offs_out_m[:, None] < chunk_size_limit) & + (offs_out_n[None, :] < hdim), + other=0.0).to(tl.float32) + acc *= z * tl.sigmoid(z) + + out_ptr += pid_b * stride_out_batch + c_idx * chunk_size * stride_out_seqlen + pid_h * stride_out_head + out_ptrs = out_ptr + (stride_out_seqlen * offs_out_m[:, None] + + offs_out_n[None, :] * stride_out_hdim) + tl.store(out_ptrs, + acc, + mask=(offs_out_m[:, None] < chunk_size_limit) & + (offs_out_n[None, :] < hdim)) + + +def _seq_idx_to_chunk_indices_offsets(seq_idx, chunk_size: int): + + # convert seq_idx to chunk indices and offsets + # - derive the cu_seqlens + _, cu_seqlens = torch.where(seq_idx.diff()) + cu_seqlens += 1 + + # outputs will have length expansion of chunks that do not divide + # chunk_size + N = math.ceil(seq_idx.shape[-1] / chunk_size) + (cu_seqlens % chunk_size + > 0).sum() + chunk_indices = torch.arange(N, dtype=torch.int, device=seq_idx.device) + chunk_offsets = torch.zeros((N, ), dtype=torch.int, device=seq_idx.device) + + cu_seqlens = cu_seqlens.tolist() + [seq_idx.shape[-1]] + p = 0 # num of insertions + for s, e in zip(cu_seqlens[:-1], cu_seqlens[1:]): + + # if does not divide chunk_size, then there is one chunk insertion + p += (s % chunk_size > 0) + + # get the dimensions + _s, _e = s // chunk_size + p, e // chunk_size + p + 1 + + # adjust inidces and offsets + chunk_indices[_s:_e] -= p + chunk_offsets[_s] = s % chunk_size + + return chunk_indices, chunk_offsets + + +def _chunk_scan_fwd( + cb, + x, + dt, + dA_cumsum, + C, + states, + D=None, + z=None, + seq_idx=None, + initial_states=None, +): + batch, seqlen, nheads, headdim = x.shape + _, _, nchunks, chunk_size = dt.shape + _, _, ngroups, dstate = C.shape + assert nheads % ngroups == 0 + assert C.shape == (batch, seqlen, 
ngroups, dstate) + assert cb.shape == (batch, nchunks, ngroups, chunk_size, chunk_size) + if z is not None: + assert z.shape == x.shape + if D is not None: + assert D.shape == (nheads, headdim) or D.shape == (nheads, ) + assert dt.shape == (batch, nheads, nchunks, chunk_size) + assert dA_cumsum.shape == (batch, nheads, nchunks, chunk_size) + assert states.shape == (batch, nchunks, nheads, headdim, dstate) + + chunk_indices, chunk_offsets = None, None + if seq_idx is not None: + assert seq_idx.shape == (batch, seqlen) + + if initial_states is not None: + # with initial states, we need to take care of how + # seq_idx crosses the boundaries + assert batch == 1, "chunk scan only supports initial states with batch 1" + assert initial_states.shape == (seq_idx[0].max() + 1, nheads, + headdim, dstate) + + if initial_states.shape[0] == 1: + # no in this case no point to use initial states + initial_states = None + else: + chunk_indices, chunk_offsets = _seq_idx_to_chunk_indices_offsets( + seq_idx, chunk_size) + + # Allocates output. + out = torch.empty(batch, + seqlen, + nheads, + headdim, + device=x.device, + dtype=x.dtype) + if z is not None: + out_x = torch.empty(batch, + seqlen, + nheads, + headdim, + device=x.device, + dtype=x.dtype) + assert out_x.stride() == out.stride() + else: + out_x = None + + grid = lambda META: ( + triton.cdiv(chunk_size, META['BLOCK_SIZE_M']) * triton.cdiv( + headdim, META['BLOCK_SIZE_N']), batch * nchunks + if chunk_offsets is None else len(chunk_offsets), nheads) + z_strides = ((z.stride(0), z.stride(1), z.stride(2), + z.stride(3)) if z is not None else (0, 0, 0, 0)) + _chunk_scan_fwd_kernel[grid]( + cb, + x, + z, + out, + out_x, + dt, + dA_cumsum, + seq_idx, + C, + states, + D, + initial_states, + chunk_indices, + chunk_offsets, + len(chunk_indices) if chunk_indices is not None else 0, + chunk_size, + headdim, + dstate, + batch, + seqlen, + nheads // ngroups, + cb.stride(0), + cb.stride(1), + cb.stride(2), + cb.stride(3), + cb.stride(4), + x.stride(0), + x.stride(1), + x.stride(2), + x.stride(3), + z_strides[0], + z_strides[1], + z_strides[2], + z_strides[3], + out.stride(0), + out.stride(1), + out.stride(2), + out.stride(3), + dt.stride(0), + dt.stride(2), + dt.stride(1), + dt.stride(3), + dA_cumsum.stride(0), + dA_cumsum.stride(2), + dA_cumsum.stride(1), + dA_cumsum.stride(3), + *((seq_idx.stride(0), seq_idx.stride(1)) if seq_idx is not None else + (0, 0)), + C.stride(0), + C.stride(1), + C.stride(2), + C.stride(3), + states.stride(0), + states.stride(1), + states.stride(2), + states.stride(3), + states.stride(4), + *((initial_states.stride(0), initial_states.stride(1), + initial_states.stride(2), + initial_states.stride(3)) if initial_states is not None else + (0, 0, 0, 0)), + D.stride(0) if D is not None else 0, + True, + D is not None, + D.dim() == 2 if D is not None else True, + BLOCK_SIZE_DSTATE=max(triton.next_power_of_2(dstate), 16), + HAS_Z=z is not None, + HAS_SEQ_IDX=seq_idx is not None, + IS_TRITON_22=TRITON_22, + HAS_INITSTATES=initial_states is not None, + ) + return out, out_x diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py new file mode 100644 index 00000000000..a970ac94580 --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -0,0 +1,750 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) 2024, Tri Dao, Albert Gu. 
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_state.py + +# ruff: noqa: E501 + +import math + +import torch +import triton +import triton.language as tl + +from .mamba_ssm import softplus + + +@triton.autotune( + configs=[ + triton.Config({'BLOCK_SIZE_H': 1}), + triton.Config({'BLOCK_SIZE_H': 2}), + triton.Config({'BLOCK_SIZE_H': 4}), + triton.Config({'BLOCK_SIZE_H': 8}), + triton.Config({'BLOCK_SIZE_H': 16}), + triton.Config({'BLOCK_SIZE_H': 32}), + triton.Config({'BLOCK_SIZE_H': 64}), + ], + key=['chunk_size', 'nheads'], +) +@triton.jit +def _chunk_cumsum_fwd_kernel( + # Pointers to matrices + dt_ptr, + A_ptr, + dt_bias_ptr, + dt_out_ptr, + dA_cumsum_ptr, + # Matrix dimension + batch, + seqlen, + nheads, + chunk_size, + dt_min, + dt_max, + # Strides + stride_dt_batch, + stride_dt_seqlen, + stride_dt_head, + stride_A_head, + stride_dt_bias_head, + stride_dt_out_batch, + stride_dt_out_chunk, + stride_dt_out_head, + stride_dt_out_csize, + stride_dA_cs_batch, + stride_dA_cs_chunk, + stride_dA_cs_head, + stride_dA_cs_csize, + # Meta-parameters + DT_SOFTPLUS: tl.constexpr, + HAS_DT_BIAS: tl.constexpr, + BLOCK_SIZE_H: tl.constexpr, + BLOCK_SIZE_CHUNK: tl.constexpr, +): + pid_b = tl.program_id(axis=0) + + # if dt is long, may cause problems, so use 64 bit + # https://github.com/triton-lang/triton/issues/1058 + pid_c = tl.program_id(axis=1).to(tl.int64) + pid_h = tl.program_id(axis=2) + dt_ptr += pid_b * stride_dt_batch + pid_c * chunk_size * stride_dt_seqlen + dt_out_ptr += pid_b * stride_dt_out_batch + pid_c * stride_dt_out_chunk + dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + + offs_h = pid_h * BLOCK_SIZE_H + tl.arange(0, BLOCK_SIZE_H) + offs_c = tl.arange(0, BLOCK_SIZE_CHUNK) + dt_ptrs = dt_ptr + (offs_h[:, None] * stride_dt_head + + offs_c[None, :] * stride_dt_seqlen) + A_ptrs = A_ptr + offs_h * stride_A_head + dt_out_ptrs = dt_out_ptr + (offs_h[:, None] * stride_dt_out_head + + offs_c[None, :] * stride_dt_out_csize) + dA_cs_ptrs = dA_cumsum_ptr + (offs_h[:, None] * stride_dA_cs_head + + offs_c[None, :] * stride_dA_cs_csize) + chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size) + + dt = tl.load(dt_ptrs, + mask=(offs_h[:, None] < nheads) & + (offs_c[None, :] < chunk_size_limit), + other=0.0).to(tl.float32) + if HAS_DT_BIAS: + dt_bias = tl.load(dt_bias_ptr + offs_h * stride_dt_bias_head, + mask=offs_h < nheads, + other=0.0).to(tl.float32) + dt += dt_bias[:, None] + if DT_SOFTPLUS: + dt = tl.where(dt <= 20.0, softplus(dt), dt) + # As of Triton 2.2.0, tl.clamp is not available yet + # dt = tl.clamp(dt, dt_min, dt_max) + dt = tl.minimum(tl.maximum(dt, dt_min), dt_max) + dt = tl.where( + (offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size_limit), dt, + 0.0) + tl.store(dt_out_ptrs, + dt, + mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size)) + A = tl.load(A_ptrs, mask=offs_h < nheads, other=0.0).to(tl.float32) + dA = dt * A[:, None] + dA_cs = tl.cumsum(dA, axis=1) + tl.store(dA_cs_ptrs, + dA_cs, + mask=(offs_h[:, None] < nheads) & (offs_c[None, :] < chunk_size)) + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 
'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 32, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + ], + key=['hdim', 'dstate', 'chunk_size'], +) +@triton.jit +def _chunk_state_fwd_kernel( + # Pointers to matrices + x_ptr, + b_ptr, + states_ptr, + dt_ptr, + dA_cumsum_ptr, + seq_idx_ptr, + # Matrix dimensions + hdim, + dstate, + chunk_size, + batch, + seqlen, + nheads_ngroups_ratio, + # Strides + stride_x_batch, + stride_x_seqlen, + stride_x_head, + stride_x_hdim, + stride_b_batch, + stride_b_seqlen, + stride_b_head, + stride_b_dstate, + stride_states_batch, + stride_states_chunk, + stride_states_head, + stride_states_hdim, + stride_states_dstate, + stride_dt_batch, + stride_dt_chunk, + stride_dt_head, + stride_dt_csize, + stride_dA_cs_batch, + stride_dA_cs_chunk, + stride_dA_cs_head, + stride_dA_cs_csize, + stride_seq_idx_batch, + stride_seq_idx_seqlen, + # Meta-parameters + HAS_SEQ_IDX: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, +): + pid_bc = tl.program_id(axis=1).to(tl.int64) + pid_c = pid_bc // batch + pid_b = pid_bc - pid_c * batch + pid_h = tl.program_id(axis=2) + num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + b_ptr += pid_b * stride_b_batch + pid_c * chunk_size * stride_b_seqlen + ( + pid_h // nheads_ngroups_ratio) * stride_b_head + x_ptr += pid_b * stride_x_batch + pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head + dt_ptr += pid_b * stride_dt_batch + pid_c * stride_dt_chunk + pid_h * stride_dt_head + dA_cumsum_ptr += pid_b * stride_dA_cs_batch + pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head + if HAS_SEQ_IDX: + seq_idx_ptr += pid_b * stride_seq_idx_batch + pid_c * chunk_size * stride_seq_idx_seqlen + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim + + offs_k[None, :] * stride_x_seqlen) + b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate + + offs_k[:, None] * stride_b_seqlen) + dt_ptrs = dt_ptr + offs_k * stride_dt_csize + dA_cs_last = tl.load(dA_cumsum_ptr + + (chunk_size - 1) * stride_dA_cs_csize).to(tl.float32) + dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize + if HAS_SEQ_IDX: + seq_idx_ptrs = seq_idx_ptr + offs_k * stride_seq_idx_seqlen + + chunk_size_limit = min(chunk_size, seqlen - pid_c * chunk_size) + if HAS_SEQ_IDX: + seq_idx_last = tl.load(seq_idx_ptr + + (chunk_size_limit - 1) * stride_seq_idx_seqlen) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, chunk_size_limit, BLOCK_SIZE_K): + x = tl.load(x_ptrs, + mask=(offs_m[:, None] < hdim) & + (offs_k[None, :] < chunk_size_limit - k), + other=0.0) + b = tl.load(b_ptrs, + mask=(offs_k[:, None] < chunk_size_limit - k) & + (offs_n[None, :] < 
dstate), + other=0.0).to(tl.float32) + dA_cs_k = tl.load(dA_cumsum_ptrs, + mask=offs_k < chunk_size_limit - k, + other=0.0).to(tl.float32) + if HAS_SEQ_IDX: + seq_idx_k = tl.load(seq_idx_ptrs, + mask=offs_k < chunk_size_limit - k, + other=-1) + dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, + other=0.0).to(tl.float32) + if not HAS_SEQ_IDX: + scale = tl.exp(dA_cs_last - dA_cs_k) * dt_k + else: + scale = tl.where(seq_idx_k == seq_idx_last, + tl.exp(dA_cs_last - dA_cs_k) * dt_k, 0.0) + b *= scale[:, None] + b = b.to(x_ptr.dtype.element_ty) + acc += tl.dot(x, b) + x_ptrs += BLOCK_SIZE_K * stride_x_seqlen + b_ptrs += BLOCK_SIZE_K * stride_b_seqlen + dt_ptrs += BLOCK_SIZE_K * stride_dt_csize + dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize + if HAS_SEQ_IDX: + seq_idx_ptrs += BLOCK_SIZE_K * stride_seq_idx_seqlen + states = acc.to(states_ptr.dtype.element_ty) + + states_ptr += pid_b * stride_states_batch + pid_c * stride_states_chunk + pid_h * stride_states_head + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim + + offs_n[None, :] * stride_states_dstate) + c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate) + tl.store(states_ptrs, states, mask=c_mask) + + +@triton.autotune( + configs=[ + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 64 + }, + num_stages=3, + num_warps=8), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 256, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 128, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 128, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=4), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 32, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 32, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=5, + num_warps=2), + triton.Config( + { + 'BLOCK_SIZE_M': 64, + 'BLOCK_SIZE_N': 64, + 'BLOCK_SIZE_K': 32 + }, + num_stages=4, + num_warps=2), + ], + key=['hdim', 'dstate', 'chunk_size'], +) +@triton.jit +def _chunk_state_varlen_kernel( + # Pointers to matrices + x_ptr, + b_ptr, + dt_ptr, + dA_cumsum_ptr, + chunk_states_ptr, + cu_seqlens_ptr, + states_ptr, + initstates_ptr, + # Matrix dimensions + hdim, + dstate, + chunk_size, + seqlen, + nheads_ngroups_ratio, + # Strides + stride_x_seqlen, + stride_x_head, + stride_x_hdim, + stride_b_seqlen, + stride_b_head, + stride_b_dstate, + stride_dt_chunk, + stride_dt_head, + stride_dt_csize, + stride_dA_cs_chunk, + stride_dA_cs_head, + stride_dA_cs_csize, + stride_chunk_states_chunk, + stride_chunk_states_head, + stride_chunk_states_hdim, + stride_chunk_states_dstate, + stride_states_batch, + stride_states_head, + stride_states_hdim, + stride_states_dstate, + stride_init_states_batch, + stride_init_states_head, + stride_init_states_hdim, + stride_init_states_dstate, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + HAS_INITSTATES: tl.constexpr, +): + pid_b = tl.program_id(axis=1) + pid_h = 
tl.program_id(axis=2) + num_pid_n = tl.cdiv(dstate, BLOCK_SIZE_N) + pid_m = tl.program_id(axis=0) // num_pid_n + pid_n = tl.program_id(axis=0) % num_pid_n + end_idx = tl.load(cu_seqlens_ptr + pid_b + 1) + pid_c = (end_idx - 1) // chunk_size + b_ptr += pid_c * chunk_size * stride_b_seqlen + ( + pid_h // nheads_ngroups_ratio) * stride_b_head + x_ptr += pid_c * chunk_size * stride_x_seqlen + pid_h * stride_x_head + dt_ptr += pid_c * stride_dt_chunk + pid_h * stride_dt_head + dA_cumsum_ptr += pid_c * stride_dA_cs_chunk + pid_h * stride_dA_cs_head + chunk_states_ptr += pid_c * stride_chunk_states_chunk + pid_h * stride_chunk_states_head + + if HAS_INITSTATES: + # if there are init states provided, we differentiate between states (which + # are boundary conditions at a chunk boundary) and initstates (which are boundary + # conditions when a new example in a cont batch starts) + initstates_ptr += pid_h * stride_init_states_head + + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + x_ptrs = x_ptr + (offs_m[:, None] * stride_x_hdim + + offs_k[None, :] * stride_x_seqlen) + b_ptrs = b_ptr + (offs_n[None, :] * stride_b_dstate + + offs_k[:, None] * stride_b_seqlen) + dt_ptrs = dt_ptr + offs_k * stride_dt_csize + dA_cs_last = tl.load(dA_cumsum_ptr + (end_idx - pid_c * chunk_size - 1) * + stride_dA_cs_csize).to(tl.float32) + dA_cumsum_ptrs = dA_cumsum_ptr + offs_k * stride_dA_cs_csize + + chunk_size_limit = end_idx - pid_c * chunk_size + start_idx = tl.load(cu_seqlens_ptr + pid_b) + start_idx_cur = tl.maximum(start_idx - pid_c * chunk_size, 0) + + acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, chunk_size_limit, BLOCK_SIZE_K): + x = tl.load(x_ptrs, + mask=(offs_m[:, None] < hdim) & + (offs_k[None, :] < chunk_size_limit - k) & + (offs_k[None, :] >= start_idx_cur - k), + other=0.0) + b = tl.load(b_ptrs, + mask=(offs_k[:, None] < chunk_size_limit - k) & + (offs_n[None, :] < dstate) & + (offs_k[:, None] >= start_idx_cur - k), + other=0.0).to(tl.float32) + dA_cs_k = tl.load(dA_cumsum_ptrs, + mask=offs_k < chunk_size_limit - k, + other=0.0).to(tl.float32) + dt_k = tl.load(dt_ptrs, mask=offs_k < chunk_size_limit - k, + other=0.0).to(tl.float32) + scale = tl.where( + (offs_k >= start_idx_cur - k) & (offs_k < chunk_size_limit - k), + tl.exp(dA_cs_last - dA_cs_k) * dt_k, 0.0) + b *= scale[:, None] + b = b.to(x_ptr.dtype.element_ty) + acc += tl.dot(x, b) + x_ptrs += BLOCK_SIZE_K * stride_x_seqlen + b_ptrs += BLOCK_SIZE_K * stride_b_seqlen + dt_ptrs += BLOCK_SIZE_K * stride_dt_csize + dA_cumsum_ptrs += BLOCK_SIZE_K * stride_dA_cs_csize + + # If the sequence starts after the last chunk idx, we don't need to add the contribution from the last chunk + # If HAS_INITSTATES==True need to consider two possiblties + # - if start_idx < pid_c * chunk_size, then we need to take the past_states_ptrs + # - if state_idx >= pid * chunk_size, then we need to insert initstates + if ((start_idx < pid_c * chunk_size) # first chunk + or (HAS_INITSTATES)): + + dA_cs_boundary = 0.0 # default + + if not HAS_INITSTATES: + past_states_ptrs = chunk_states_ptr + ( + offs_m[:, None] * stride_chunk_states_hdim + + offs_n[None, :] * stride_chunk_states_dstate) + else: + + # - this seems repetitve, buts its to help the compiler + if start_idx < pid_c * chunk_size: + past_states_ptrs = chunk_states_ptr + ( + offs_m[:, None] * stride_chunk_states_hdim + + offs_n[None, :] * stride_chunk_states_dstate) + 
else: + past_states_ptrs = initstates_ptr + ( + pid_b * stride_init_states_batch + + offs_m[:, None] * stride_init_states_hdim + + offs_n[None, :] * stride_init_states_dstate) + + # need to adjust the boundary + if start_idx > pid_c * chunk_size: + dA_cs_boundary = tl.load(dA_cumsum_ptr + + (start_idx - pid_c * chunk_size - + 1) * stride_dA_cs_csize).to( + tl.float32) + + past_states = tl.load(past_states_ptrs, + mask=(offs_m[:, None] < hdim) & + (offs_n[None, :] < dstate), + other=0.0).to(tl.float32) + + scale = tl.exp(dA_cs_last - dA_cs_boundary) + acc += past_states * scale + + states = acc.to(states_ptr.dtype.element_ty) + + states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + states_ptrs = states_ptr + (offs_m[:, None] * stride_states_hdim + + offs_n[None, :] * stride_states_dstate) + c_mask = (offs_m[:, None] < hdim) & (offs_n[None, :] < dstate) + tl.store(states_ptrs, states, mask=c_mask) + + +def _chunk_cumsum_fwd(dt, + A, + chunk_size, + dt_bias=None, + dt_softplus=False, + dt_limit=(0.0, float("inf"))): + batch, seqlen, nheads = dt.shape + assert A.shape == (nheads, ) + if dt_bias is not None: + assert dt_bias.shape == (nheads, ) + nchunks = math.ceil(seqlen / chunk_size) + dt_out = torch.empty(batch, + nheads, + nchunks, + chunk_size, + device=dt.device, + dtype=torch.float32) + dA_cumsum = torch.empty(batch, + nheads, + nchunks, + chunk_size, + device=dt.device, + dtype=torch.float32) + grid_chunk_cs = lambda META: (batch, nchunks, + triton.cdiv(nheads, META['BLOCK_SIZE_H'])) + with torch.cuda.device(dt.device.index): + _chunk_cumsum_fwd_kernel[grid_chunk_cs]( + dt, + A, + dt_bias, + dt_out, + dA_cumsum, + batch, + seqlen, + nheads, + chunk_size, + dt_limit[0], + dt_limit[1], + dt.stride(0), + dt.stride(1), + dt.stride(2), + A.stride(0), + dt_bias.stride(0) if dt_bias is not None else 0, + dt_out.stride(0), + dt_out.stride(2), + dt_out.stride(1), + dt_out.stride(3), + dA_cumsum.stride(0), + dA_cumsum.stride(2), + dA_cumsum.stride(1), + dA_cumsum.stride(3), + dt_softplus, + HAS_DT_BIAS=dt_bias is not None, + BLOCK_SIZE_CHUNK=triton.next_power_of_2(chunk_size), + ) + return dA_cumsum, dt_out + + +def _chunk_state_fwd(B, + x, + dt, + dA_cumsum, + seq_idx=None, + states=None, + states_in_fp32=True): + batch, seqlen, nheads, headdim = x.shape + _, _, nchunks, chunk_size = dt.shape + _, _, ngroups, dstate = B.shape + assert nheads % ngroups == 0 + assert B.shape == (batch, seqlen, ngroups, dstate) + assert dt.shape == (batch, nheads, nchunks, chunk_size) + assert dA_cumsum.shape == dt.shape + if seq_idx is not None: + assert seq_idx.shape == (batch, seqlen) + if states is not None: + assert states.shape == (batch, nchunks, nheads, headdim, dstate) + else: + states_dtype = torch.float32 if states_in_fp32 else B.dtype + states = torch.empty((batch, nchunks, nheads, headdim, dstate), + device=x.device, + dtype=states_dtype) + grid = lambda META: ( + triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton.cdiv( + dstate, META['BLOCK_SIZE_N']), batch * nchunks, nheads) + with torch.cuda.device(x.device.index): + _chunk_state_fwd_kernel[grid]( + x, + B, + states, + dt, + dA_cumsum, + seq_idx, + headdim, + dstate, + chunk_size, + batch, + seqlen, + nheads // ngroups, + x.stride(0), + x.stride(1), + x.stride(2), + x.stride(3), + B.stride(0), + B.stride(1), + B.stride(2), + B.stride(-1), + states.stride(0), + states.stride(1), + states.stride(2), + 
states.stride(3), + states.stride(4), + dt.stride(0), + dt.stride(2), + dt.stride(1), + dt.stride(3), + dA_cumsum.stride(0), + dA_cumsum.stride(2), + dA_cumsum.stride(1), + dA_cumsum.stride(3), + *((seq_idx.stride(0), + seq_idx.stride(1)) if seq_idx is not None else (0, 0)), + HAS_SEQ_IDX=seq_idx is not None, + ) + return states + + +def chunk_state_varlen(B, + x, + dt, + dA_cumsum, + cu_seqlens, + chunk_states, + initial_states=None): + total_seqlen, nheads, headdim = x.shape + _, nchunks, chunk_size = dt.shape + _, ngroups, dstate = B.shape + batch = cu_seqlens.shape[0] - 1 + cu_seqlens = cu_seqlens.contiguous() + assert nheads % ngroups == 0 + assert B.shape == (total_seqlen, ngroups, dstate) + assert dt.shape == (nheads, nchunks, chunk_size) + assert dA_cumsum.shape == dt.shape + assert chunk_states.shape == (nchunks, nheads, headdim, dstate) + + if initial_states is not None: + assert initial_states.shape == (batch, nheads, headdim, dstate) + + states = torch.empty(batch, + nheads, + headdim, + dstate, + dtype=chunk_states.dtype, + device=chunk_states.device) + grid = lambda META: (triton.cdiv(headdim, META['BLOCK_SIZE_M']) * triton. + cdiv(dstate, META['BLOCK_SIZE_N']), batch, nheads) + with torch.cuda.device(x.device.index): + _chunk_state_varlen_kernel[grid]( + x, + B, + dt, + dA_cumsum, + chunk_states, + cu_seqlens, + states, + initial_states, + headdim, + dstate, + chunk_size, + total_seqlen, + nheads // ngroups, + x.stride(0), + x.stride(1), + x.stride(2), + B.stride(0), + B.stride(1), + B.stride(2), + dt.stride(1), + dt.stride(0), + dt.stride(2), + dA_cumsum.stride(1), + dA_cumsum.stride(0), + dA_cumsum.stride(2), + chunk_states.stride(0), + chunk_states.stride(1), + chunk_states.stride(2), + chunk_states.stride(3), + states.stride(0), + states.stride(1), + states.stride(2), + states.stride(3), + *((initial_states.stride(0), initial_states.stride(1), + initial_states.stride(2), + initial_states.stride(3)) if initial_states is not None else + (0, 0, 0, 0)), + HAS_INITSTATES=initial_states is not None) + return states diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py new file mode 100644 index 00000000000..97cdb70b63c --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -0,0 +1,223 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) 2024, Tri Dao, Albert Gu. 
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_combined.py + +# ruff: noqa: E501 + +import torch +import triton +from einops import rearrange +from packaging import version + +from .ssd_bmm import _bmm_chunk_fwd +from .ssd_chunk_scan import _chunk_scan_fwd +from .ssd_chunk_state import (_chunk_cumsum_fwd, _chunk_state_fwd, + chunk_state_varlen) +from .ssd_state_passing import _state_passing_fwd + +TRITON_22 = version.parse(triton.__version__) >= version.parse('2.2.0') + + +def _mamba_chunk_scan_combined_fwd(x, + dt, + A, + B, + C, + chunk_size, + D=None, + z=None, + dt_bias=None, + initial_states=None, + seq_idx=None, + cu_seqlens=None, + dt_softplus=False, + dt_limit=(0.0, float("inf"))): + batch, seqlen, nheads, headdim = x.shape + _, _, ngroups, dstate = B.shape + assert nheads % ngroups == 0 + assert B.shape == (batch, seqlen, ngroups, dstate) + assert x.shape == (batch, seqlen, nheads, headdim) + assert dt.shape == (batch, seqlen, nheads) + assert A.shape == (nheads, ) + assert C.shape == B.shape + if z is not None: + assert z.shape == x.shape + if D is not None: + assert D.shape == (nheads, headdim) or D.shape == (nheads, ) + if seq_idx is not None: + assert seq_idx.shape == (batch, seqlen) + if B.stride(-1) != 1: + B = B.contiguous() + if C.stride(-1) != 1: + C = C.contiguous() + if x.stride(-1) != 1 and x.stride( + 1) != 1: # Either M or K dimension should be contiguous + x = x.contiguous() + if z is not None and z.stride(-1) != 1 and z.stride( + 1) != 1: # Either M or K dimension should be contiguous + z = z.contiguous() + if D is not None and D.stride(-1) != 1: + D = D.contiguous() + if initial_states is not None: + if cu_seqlens is None: + assert initial_states.shape == (batch, nheads, headdim, dstate) + else: + assert initial_states.shape == (len(cu_seqlens) - 1, nheads, + headdim, dstate) + + # This function executes 5 sub-functions for computing mamba + # - a good resource is the blog https://goombalab.github.io/blog/2024/mamba2-part3-algorithm/ + # which has a minimal implementation to understand the below operations + # - as explained by the blog, mamba is a special case of causal attention + # - the idea is to chunk the attention matrix and compute each + # submatrix separately using different optimizations. + # - see the blog and paper for a visualization of the submatrices + # which we refer to in the comments below + + # 1. Compute chunked cumsum of A * dt + # - here dt may go through a softplus activation + dA_cumsum, dt = _chunk_cumsum_fwd(dt, + A, + chunk_size, + dt_bias=dt_bias, + dt_softplus=dt_softplus, + dt_limit=dt_limit) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + states = _chunk_state_fwd(B, + x, + dt, + dA_cumsum, + seq_idx=seq_idx, + states_in_fp32=True) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + # - for handling chunked prefill, this requires i) initial_states + # ii) seq_idx and iii) has_cu_seqlens to be all specified. + # - When a new seq_idx is detected, we will stop passing the prev_state + # and switch accordingly to the init_state corresponding to the new seq_idx. + # - this will ensure that states will be updated with the rightmost flushed seq_idx + # of the previous chunk. This implies that the first chunk of states is either 0 + # or equal to init_states of the first example. 
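# (A rough pure-PyTorch reference of the recurrence performed by
#  _state_passing_fwd below, to make step 3 concrete. It is a simplified
#  sketch only: no seq_idx handling, no continuous batching, and the dtype
#  handling of the Triton kernel is ignored. It relies on the torch import
#  at the top of this module.)
def _state_passing_ref(states, dA_cs_last, initial_states=None):
    # states:     (batch, nchunks, nheads, dim)  per-chunk states from step 2
    # dA_cs_last: (batch, nheads, nchunks)       i.e. dA_cumsum[..., -1]
    # returns the state entering each chunk plus the final state
    batch, nchunks, nheads, dim = states.shape
    out = torch.empty_like(states)
    running = (initial_states if initial_states is not None else
               torch.zeros(batch, nheads, dim,
                           dtype=states.dtype, device=states.device))
    for c in range(nchunks):
        out[:, c] = running                  # boundary state for chunk c
        running = (torch.exp(dA_cs_last[:, :, c])[..., None] * running
                   + states[:, c])
    return out, running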
+ states, final_states = _state_passing_fwd( + rearrange(states, "... p n -> ... (p n)"), + dA_cumsum[:, :, :, -1], + initial_states=rearrange(initial_states, "... p n -> ... (p n)") + if initial_states is not None else None, + seq_idx=seq_idx, + chunk_size=chunk_size, + out_dtype=C.dtype, + is_cont_batched=cu_seqlens is not None) + states, final_states = (rearrange(t, "... (p n) -> ... p n", n=dstate) + for t in [states, final_states]) + + # 4. Compute batched matrix multiply for C_j^T B_i terms + CB = _bmm_chunk_fwd(C, + B, + chunk_size, + seq_idx=seq_idx, + output_dtype=torch.float32) + + # 5. Scan and compute the diagonal blocks, taking into + # account past causal states. + # - if initial states are provided, then states information will be + # augmented with initial_states. + # - to do this properly, we need to account for example changes in + # the continuous batch, therefore we introduce pseudo chunks, which is + # a chunk that is split up each time an example changes. + # - in each (pseudo) chunk, we detect if the previous (pseudo) chunk had + # a seq_idx change, in which case we take states information from + # init_states. + out, out_x = _chunk_scan_fwd( + CB, + x, + dt, + dA_cumsum, + C, + states, + D=D, + z=z, + seq_idx=seq_idx, + initial_states=initial_states, + ) + if cu_seqlens is None: + return out, out_x, dt, dA_cumsum, states, final_states + else: + assert batch == 1, "passing cu_seqlens to get the varlen states is only supported if batch dimension is 1" + varlen_states = chunk_state_varlen( + B.squeeze(0), + x.squeeze(0), + dt.squeeze(0), + dA_cumsum.squeeze(0), + cu_seqlens, + states.squeeze(0), + initial_states=initial_states, + ) + return out, out_x, dt, dA_cumsum, states, final_states, varlen_states + + +def mamba_chunk_scan_combined(x, + dt, + A, + B, + C, + chunk_size, + D=None, + z=None, + dt_bias=None, + initial_states=None, + seq_idx=None, + cu_seqlens=None, + dt_softplus=False, + dt_limit=(0.0, float("inf")), + return_final_states=False, + return_varlen_states=False): + """ + Argument: + x: (batch, seqlen, nheads, headdim) + dt: (batch, seqlen, nheads) + A: (nheads) + B: (batch, seqlen, ngroups, dstate) + C: (batch, seqlen, ngroups, dstate) + chunk_size: int + D: (nheads, headdim) or (nheads,) + z: (batch, seqlen, nheads, headdim) + dt_bias: (nheads,) + initial_states: (batch, nheads, headdim, dstate) + seq_idx: (batch, seqlen) + cu_seqlens: (num_sequences + 1) or None, only used if return_varlen_states is True + dt_softplus: Whether to apply softplus to dt + Return: + out: (batch, seqlen, nheads, headdim) + """ + + if not return_varlen_states: + cu_seqlens = None + else: + assert cu_seqlens is not None, "cu_seqlens must be provided if return_varlen_states is True" + out, out_x, dt_out, dA_cumsum, states, final_states, *rest = _mamba_chunk_scan_combined_fwd( + x, + dt, + A, + B, + C, + chunk_size, + D=D, + z=z, + dt_bias=dt_bias, + initial_states=initial_states, + seq_idx=seq_idx, + cu_seqlens=cu_seqlens, + dt_softplus=dt_softplus, + dt_limit=dt_limit) + if not return_varlen_states: + return out if not return_final_states else (out, final_states) + else: + varlen_states = rest[0] + return (out, + varlen_states) if not return_final_states else (out, + final_states, + varlen_states) diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py new file mode 100644 index 00000000000..d8f87c113f1 --- /dev/null +++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py @@ -0,0 +1,207 @@ +# 
SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) 2024, Tri Dao, Albert Gu. +# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_state_passing.py + +# ruff: noqa: E501 + +import torch +import triton +import triton.language as tl + + +@triton.autotune( + configs=[ + triton.Config({'BLOCK_SIZE': 64}), + triton.Config({'BLOCK_SIZE': 128}), + triton.Config({'BLOCK_SIZE': 256}), + triton.Config({'BLOCK_SIZE': 512}), + triton.Config({'BLOCK_SIZE': 1024}), + triton.Config({'BLOCK_SIZE': 2048}), + ], + key=['dim'], +) +@triton.jit +def _state_passing_fwd_kernel( + # Pointers to matrices + states_ptr, + out_ptr, + final_states_ptr, + dA_cs_ptr, + initstates_ptr, + seq_idx_ptr, + # Matrix dimensions + dim, + nchunks, + seqlen, + chunk_size, + # Strides + stride_states_batch, + stride_states_chunk, + stride_states_head, + stride_states_dim, + stride_out_batch, + stride_out_chunk, + stride_out_head, + stride_out_dim, + stride_final_states_batch, + stride_final_states_head, + stride_final_states_dim, + stride_dA_cs_batch, + stride_dA_cs_chunk, + stride_dA_cs_head, + stride_initstates_batch, + stride_initstates_head, + stride_initstates_dim, + stride_seq_idx_batch, + stride_seq_idx_seqlen, + # Meta-parameters + HAS_INITSTATES: tl.constexpr, + HAS_SEQ_IDX: tl.constexpr, + IS_CONT_BATCHED: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + pid_b = tl.program_id(axis=1) + pid_h = tl.program_id(axis=2) + pid_m = tl.program_id(axis=0) + states_ptr += pid_b * stride_states_batch + pid_h * stride_states_head + dA_cs_ptr += pid_b * stride_dA_cs_batch + pid_h * stride_dA_cs_head + out_ptr += pid_b * stride_out_batch + pid_h * stride_out_head + final_states_ptr += pid_b * stride_final_states_batch + pid_h * stride_final_states_head + if HAS_INITSTATES: + initstates_ptr += pid_h * stride_initstates_head + if not IS_CONT_BATCHED: + initstates_ptr += pid_b * stride_initstates_batch + + if HAS_SEQ_IDX: + seq_idx_ptr += pid_b * stride_seq_idx_batch + + offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + states_ptrs = states_ptr + offs_m * stride_states_dim + out_ptrs = out_ptr + offs_m * stride_out_dim + final_states_ptrs = final_states_ptr + offs_m * stride_final_states_dim + + # - states will be the past state of the sequence that continues on the current check + if not HAS_INITSTATES: + states = tl.zeros((BLOCK_SIZE, ), dtype=tl.float32) + else: + initstates_ptr += offs_m * stride_initstates_dim + initstates_ptrs = initstates_ptr + # - for cont batches, for the first chunk mean it will be the first batch's + # init state + states = tl.load(initstates_ptrs, mask=offs_m < dim, + other=0.0).to(tl.float32) + + tl.store(out_ptrs, states, mask=offs_m < dim) + out_ptrs += stride_out_chunk + seq_idx = 0 + for c in range(nchunks): + new_states = tl.load(states_ptrs, mask=offs_m < dim, + other=0.0).to(tl.float32) + dA_cs = tl.load(dA_cs_ptr).to(tl.float32) + scale = tl.exp(dA_cs) + if HAS_SEQ_IDX: + # - the seq to pass forward is the one that is flushed to the right + # boundary. + # - that is given by seq_idx_new below. + seq_idx_new = tl.load(seq_idx_ptr + + (min((c + 1) * chunk_size, seqlen) - 1) * + stride_seq_idx_seqlen) + if HAS_INITSTATES: + if IS_CONT_BATCHED and seq_idx != seq_idx_new: + # this means in the current chunk the rightmost flushed seq + # has changed. 
+ # - so we do not propagate the state from previous chunk + # - but rather we load that sequence's init state + initstates_ptrs = initstates_ptr + seq_idx_new * stride_initstates_batch + + # - update state with seq_idx_new's init state + states = tl.load(initstates_ptrs, + mask=offs_m < dim, + other=0.0).to(tl.float32) + else: + scale = tl.where(seq_idx_new == seq_idx, scale, 0.0) + + seq_idx = seq_idx_new + states = scale * states + new_states + if c < nchunks - 1: + tl.store(out_ptrs, states, mask=offs_m < dim) + else: + tl.store(final_states_ptrs, states, mask=offs_m < dim) + states_ptrs += stride_states_chunk + dA_cs_ptr += stride_dA_cs_chunk + out_ptrs += stride_out_chunk + + +def _state_passing_fwd( + states, + dA_chunk_cumsum, + initial_states=None, + seq_idx=None, + chunk_size=None, + out_dtype=None, + is_cont_batched=False, +): + batch, nchunks, nheads, dim = states.shape + assert dA_chunk_cumsum.shape == (batch, nheads, nchunks) + if initial_states is not None: + if is_cont_batched: + # - if cu_seqlens is provided, then the initial states + # are used for continuous batching. In which case we + # require seq_idx to be provided + assert seq_idx is not None, "" + assert initial_states.shape == (seq_idx.max().item() + 1, nheads, + dim) + else: + # - this is the regular batching case, where initial + # states are used are for each example of the batch. + assert initial_states.shape == (batch, nheads, dim) + + if seq_idx is not None: + assert chunk_size is not None + seqlen = seq_idx.shape[-1] + assert seq_idx.shape == (batch, seqlen) + out_dtype = states.dtype if out_dtype is None else out_dtype + out = torch.empty((batch, nchunks, nheads, dim), + device=states.device, + dtype=out_dtype) + final_states = torch.empty((batch, nheads, dim), + device=states.device, + dtype=torch.float32) + grid = lambda META: (triton.cdiv(dim, META['BLOCK_SIZE']), batch, nheads) + with torch.cuda.device(states.device.index): + _state_passing_fwd_kernel[grid]( + states, + out, + final_states, + dA_chunk_cumsum, + initial_states, + seq_idx, + dim, + nchunks, + seqlen if seq_idx is not None else 0, + chunk_size if seq_idx is not None else 0, + states.stride(0), + states.stride(1), + states.stride(2), + states.stride(3), + out.stride(0), + out.stride(1), + out.stride(2), + out.stride(3), + final_states.stride(0), + final_states.stride(1), + final_states.stride(2), + dA_chunk_cumsum.stride(0), + dA_chunk_cumsum.stride(2), + dA_chunk_cumsum.stride(1), + *((initial_states.stride(0), initial_states.stride(1), + initial_states.stride(2)) if initial_states is not None else + (0, 0, 0)), + *((seq_idx.stride(0), + seq_idx.stride(1)) if seq_idx is not None else (0, 0)), + HAS_INITSTATES=initial_states is not None, + HAS_SEQ_IDX=seq_idx is not None, + IS_CONT_BATCHED=is_cont_batched, + ) + return out, final_states diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py new file mode 100644 index 00000000000..72b74e31b6c --- /dev/null +++ b/vllm/model_executor/models/bamba.py @@ -0,0 +1,592 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Inference-only Bamba model.""" +# Added by the IBM Team, 2024 +from typing import Iterable, List, Optional, Set, Tuple + +import torch +from torch import nn +from transformers import BambaConfig + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.layer import Attention +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size +from 
vllm.distributed.parallel_state import get_pp_group +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.mamba_mixer2 import ( + MambaMixer2, extra_groups_for_head_shards) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.mamba_cache import (MambaCacheManager, + MambaCacheParams) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.utils import LayerBlockType + +from .interfaces import HasInnerState, IsHybrid, SupportsLoRA, SupportsPP +from .utils import (is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class BambaMLP(nn.Module): + + def __init__( + self, + config: BambaConfig, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=config.hidden_size, + output_sizes=[config.intermediate_size] * 2, + bias=bias, + quant_config=quant_config, + ) + self.down_proj = RowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=bias, + quant_config=quant_config, + ) + if config.hidden_act != "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + x, _ = self.gate_up_proj(x) + x = self.act_fn(x) + x, _ = self.down_proj(x) + return x + + +class BambaMixerDecoderLayer(nn.Module): + + def __init__(self, + config: BambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__() + self.config = config + self.mamba = MambaMixer2(hidden_size= config.hidden_size, + ssm_state_size = config.mamba_d_state, + conv_kernel_size = config.mamba_d_conv, + intermediate_size = config.mamba_expand *\ + config.hidden_size, + use_conv_bias = config.mamba_conv_bias, + use_bias = config.mamba_proj_bias, + n_groups=config.mamba_n_groups, + num_heads=config.mamba_n_heads, + head_dim=config.mamba_d_head, + rms_norm_eps=config.rms_norm_eps, + activation=config.hidden_act, + chunk_size=config.mamba_chunk_size, + quant_config=quant_config) + + self.feed_forward = BambaMLP(config, quant_config=quant_config) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + mamba_cache_params: MambaCacheParams, + sequence_idx: Optional[torch.Tensor] = None, + **kwargs, + ): + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.mamba(hidden_states, attn_metadata, + mamba_cache_params, sequence_idx) + # Fully Connected + hidden_states, residual = self.pre_ff_layernorm( + hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class BambaAttentionDecoderLayer(nn.Module): + + def __init__( + self, + config: BambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + if hasattr(config, "partial_rotary_factor"): + rotary_dim = self.head_dim * config.partial_rotary_factor + elif hasattr(config, "attn_rotary_emb"): + rotary_dim = config.attn_rotary_emb # for backward compatibility + else: + rotary_dim = self.head_dim # default + + self.rotary_emb = get_rope( + head_size=self.head_dim, + rotary_dim=rotary_dim, + max_position=max_position_embeddings, + rope_scaling=rope_scaling, + base=rope_theta, + is_neox_style=True, + dtype=torch.get_default_dtype(), # see impl of get_rope + ) + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=quant_config, + ) + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn", + ) + + self.feed_forward = BambaMLP(config, quant_config=quant_config) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.pre_ff_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def self_attention( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + **kwargs, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + **kwargs, + ): + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.self_attention( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + # Fully Connected + hidden_states, residual = self.pre_ff_layernorm( + hidden_states, residual) + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +ALL_DECODER_LAYER_TYPES = { + "attention": BambaAttentionDecoderLayer, + "mamba": BambaMixerDecoderLayer +} + + +class BambaModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + 
org_num_embeddings=config.vocab_size, + ) + + def get_layer(prefix: str): + layer_idx = int(prefix.rsplit(".", 1)[1]) + layer_class = ALL_DECODER_LAYER_TYPES[ + config.layers_block_type[layer_idx]] + return layer_class( + config, + layer_idx, + cache_config, + quant_config=quant_config, + prefix=prefix, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + self.final_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + mamba_cache_params: MambaCacheParams, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + # pass a sequence index tensor, that is required for + # proper continuous batching computation including + # chunked prefill + seq_idx = None + if attn_metadata.num_prefills > 0: + seq_idx = torch.zeros_like(input_ids, dtype=torch.int32) + for i, (srt, end) in enumerate( + zip( + attn_metadata.query_start_loc, + attn_metadata.query_start_loc[1:], + )): + seq_idx[srt:end] = i + seq_idx.unsqueeze_(0) + + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + residual = None + num_attn = 0 + for i in range(len(self.layers)): + layer = self.layers[i] + kv_cache = None + if isinstance(layer, BambaAttentionDecoderLayer): + kv_cache = kv_caches[num_attn] + num_attn += 1 + + layer_mamba_cache_params = None + if isinstance(layer, BambaMixerDecoderLayer): + layer_mamba_cache_params = mamba_cache_params.at_layer_idx( + i - num_attn) + + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + residual=residual, + mamba_cache_params=layer_mamba_cache_params, + sequence_idx=seq_idx, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.final_layernorm(hidden_states, residual) + return hidden_states + + +class BambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, + IsHybrid): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": ["up_proj", "down_proj"] + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "embed_tokens", + "lm_head", + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config + assert not cache_config.enable_prefix_caching, \ + "Bamba 
currently does not support prefix caching" + + self.quant_config = vllm_config.quant_config + + super().__init__() + self.config = config + self.scheduler_config = scheduler_config + self.model = BambaModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) + # Used to track and store by the Mamba cache between steps. + self.mamba_cache: Optional[MambaCacheManager] = None + + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size) + self.sampler = get_sampler() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + # follow jamba + if self.scheduler_config is not None and \ + not self.model_config.enforce_eager: + # for compilation + if self.scheduler_config.max_num_seqs > \ + vllm_config.compilation_config.max_capture_size: + self.max_batch_size = \ + vllm_config.compilation_config.max_capture_size + else: + self.max_batch_size = vllm_config.pad_for_cudagraph( + self.scheduler_config.max_num_seqs) + elif self.scheduler_config is not None: + # for eager just take the scheduler_config if avail + self.max_batch_size = self.scheduler_config.max_num_seqs + else: + self.max_batch_size = 8192 + 2 + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs): + if self.mamba_cache is None: + + num_mamba_layers = self.model_config.get_num_layers_by_block_type( + self.vllm_config.parallel_config, LayerBlockType.mamba) + + self.mamba_cache = MambaCacheManager( + self.lm_head.weight.dtype, num_mamba_layers, + self.max_batch_size, *self._get_mamba_cache_shape()) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, mamba_cache_params, + intermediate_tensors, inputs_embeds) + + return hidden_states + + def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): + return self.mamba_cache.copy_inputs_before_cuda_graphs( + input_buffers, **kwargs) + + def get_seqlen_agnostic_capture_inputs(self, batch_size: int): + return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) + + def _get_mamba_cache_shape( + self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + world_size = get_tensor_model_parallel_world_size() + hidden_size = self.config.hidden_size + + conv_state_shape, temporal_state_shape = None, None + + intermediate_size = self.config.mamba_expand * hidden_size + + # if n_groups is not divisible by world_size, need to extend the shards + # to ensure all groups needed by a head is sharded along with it + n_groups = (self.config.mamba_n_groups + extra_groups_for_head_shards( + self.config.mamba_n_groups, world_size)) + + # - heads and n_groups are TP-ed + conv_dim = (intermediate_size + + 2 * n_groups * self.config.mamba_d_state) + conv_state_shape = ( + 
divide(conv_dim, world_size), + self.config.mamba_d_conv - 1, + ) + + # These are not TP-ed as they depend on A, dt_bias, D + # - they are typically small + # e.g., (h_heads, d_head, d_state) = (128, 64, 128) + temporal_state_shape = ( + divide(self.config.mamba_n_heads, world_size), + self.config.mamba_d_head, + self.config.mamba_d_state, + ) + return conv_state_shape, temporal_state_shape + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: Optional[torch.Tensor], + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if "A_log" in name: + name = name.replace("A_log", "A") + + if ".self_attn." in name: + name = name.replace(".self_attn", "") + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index d82c0815213..f307f279dad 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -455,14 +455,9 @@ def forward(self, self.mamba_cache = MambaCacheManager( self.lm_head.weight.dtype, num_mamba_layers, self.max_batch_size, *self._get_mamba_cache_shape()) - ( - mamba_cache_tensors, - state_indices_tensor, - ) = self.mamba_cache.current_run_tensors(input_ids, attn_metadata, - **kwargs) - mamba_cache_params = MambaCacheParams(mamba_cache_tensors[0], - mamba_cache_tensors[1], - state_indices_tensor) + + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) + hidden_states = self.model(input_ids, positions, kv_caches, attn_metadata, mamba_cache_params, intermediate_tensors, inputs_embeds) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 5034b334564..3bbc219e92a 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -232,15 +232,7 @@ def forward(self, self.lm_head.weight.dtype, num_mamba_layers, self.max_batch_size, *self._get_mamba_cache_shape()) - ( - mamba_cache_tensors, - state_indices_tensor, - ) = self.mamba_cache.current_run_tensors(input_ids, attn_metadata, - **kwargs) - - mamba_cache_params = MambaCacheParams(mamba_cache_tensors[0], - mamba_cache_tensors[1], - state_indices_tensor) + mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs) hidden_states = self.backbone(input_ids, positions, attn_metadata, mamba_cache_params, intermediate_tensors, diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 353177f784b..ce4197507da 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -5,7 +5,6 @@ import torch -from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.utils import PAD_SLOT_ID @@ -42,8 +41,7 @@ def __init__(self, dtype, num_mamba_layers, max_batch_size, self.mamba_cache_indices_mapping: Dict[str, Dict[int, int]] = {} self.free_cache_indices = list(range(max_batch_size)) - def current_run_tensors(self, input_ids: torch.Tensor, - attn_metadata: AttentionMetadata, **kwargs): + def current_run_tensors(self, **kwargs) -> MambaCacheParams: """ Return the tensors for the current run's conv and ssm state. 
""" @@ -66,7 +64,8 @@ def current_run_tensors(self, input_ids: torch.Tensor, (mamba_cache_tensors, state_indices_tensor) = kwargs["seqlen_agnostic_capture_inputs"] - return (mamba_cache_tensors, state_indices_tensor) + return MambaCacheParams(mamba_cache_tensors[0], mamba_cache_tensors[1], + state_indices_tensor) def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): """ diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3b2a7069efc..c2d0fae7056 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -37,6 +37,7 @@ "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-13b, lower case 'c' in the class name "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), + "BambaForCausalLM": ("bamba", "BambaForCausalLM"), "BloomForCausalLM": ("bloom", "BloomForCausalLM"), # ChatGLMModel supports multimodal "CohereForCausalLM": ("commandr", "CohereForCausalLM"), From f36207578b972750461374c2274660d2350e015f Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Thu, 6 Feb 2025 15:36:21 -0800 Subject: [PATCH 0044/1240] [MISC] Check space in the file names in the pre commit checks (#12804) Signed-off-by: Lu Fang Signed-off-by: Louis Ulmer --- .pre-commit-config.yaml | 6 ++++++ ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...e=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 14 files changed, 6 insertions(+) rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => 
N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json => N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4568efcbba2..0b1c4fdf26a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -108,3 +108,9 @@ repos: language: system verbose: true pass_filenames: false + - id: check-filenames + name: Check for spaces in all filenames + entry: bash -c 'git ls-files | grep " " && echo "Filenames should not contain spaces!" 
&& exit 1 || exit 0' + language: system + always_run: true + pass_filenames: false diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to 
vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from 
vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json From 9da1901cb49727d0e36d5f131dcf1e46019b473a Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Thu, 6 Feb 2025 16:29:12 -0800 Subject: [PATCH 0045/1240] [misc] Revert # 12833 (#12857) Signed-off-by: <> Co-authored-by: EC2 Default User Signed-off-by: Louis Ulmer --- vllm/inputs/preprocess.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 53f89996f0f..035e84cc063 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -260,6 +260,9 @@ def _process_multimodal( mm_processor = self.mm_registry.create_processor( self.model_config, tokenizer) + if isinstance(prompt, list): + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: mm_processor_kwargs = {} From 2060c48cf1490141ca622d60e83ad132b9385b80 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 6 Feb 2025 22:54:07 -0500 Subject: [PATCH 0046/1240] [Bugfix] FA2 illegal memory access (#12848) Signed-off-by: Louis Ulmer --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c823c9ff895..b99061dfde4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -581,7 +581,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c + GIT_TAG 720c94869cf2e0ff5a706e9c7f1dce0939686ade GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 9d456fa381ee32d39ad9d05a7155fddec8d00e0b Mon Sep 17 00:00:00 2001 From: ZSL98 <36250440+ZSL98@users.noreply.github.com> Date: Fri, 7 Feb 2025 11:54:20 +0800 Subject: [PATCH 0047/1240] Make vllm compatible with verl (#12824) Co-authored-by: zhangshulai Signed-off-by: Louis Ulmer --- vllm/distributed/parallel_state.py | 7 ------- vllm/executor/uniproc_executor.py | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 321902d11fd..bfc41703b94 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1024,13 +1024,6 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if (world_size - != tensor_model_parallel_size * pipeline_model_parallel_size): - raise RuntimeError( - f"world_size ({world_size}) is not equal to " - f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " - f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") - # Build the tensor model-parallel groups. 
num_tensor_model_parallel_groups: int = (world_size // tensor_model_parallel_size) diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index dcb4a8f27c2..e5464cafaec 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -101,7 +101,7 @@ def _init_executor(self) -> None: # - MASTER_PORT distributed_init_method = "env://" rank = int(os.environ["RANK"]) - local_rank = rank + local_rank = int(os.environ["LOCAL_RANK"]) is_driver_worker = True kwargs = dict( vllm_config=self.vllm_config, From 9fb617f52ff9a2288d168d08c8c8933d1b6c29ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20O=C5=BC=C3=B3g?= <58388001+SzymonOzog@users.noreply.github.com> Date: Fri, 7 Feb 2025 06:35:09 +0100 Subject: [PATCH 0048/1240] [Bugfix] Missing quant_config in deepseek embedding layer (#12836) Signed-off-by: Louis Ulmer --- vllm/model_executor/models/deepseek_v2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0c6f07ce7b1..fd0e58fa145 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -581,7 +581,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, - ) + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") else: self.embed_tokens = PPMissingLayer() From 2779a12285019dff4bc0ded39346cd5a1e4b5d09 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Fri, 7 Feb 2025 02:37:41 -0300 Subject: [PATCH 0049/1240] Prevent unecessary requests to huggingface hub (#12837) Signed-off-by: Louis Ulmer --- .../offline_mode/test_offline_mode.py | 21 ++++ vllm/transformers_utils/config.py | 115 ++++++++++++------ 2 files changed, 96 insertions(+), 40 deletions(-) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index eac76f2ba0f..85156d6931c 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -4,6 +4,7 @@ import sys import pytest +import urllib3 from vllm import LLM from vllm.distributed import cleanup_dist_env_and_memory @@ -28,6 +29,15 @@ "tensor_parallel_size": 1, "tokenizer_mode": "mistral", }, + { + "model": "sentence-transformers/all-MiniLM-L12-v2", + "enforce_eager": True, + "gpu_memory_utilization": 0.20, + "max_model_len": 64, + "max_num_batched_tokens": 64, + "max_num_seqs": 64, + "tensor_parallel_size": 1, + }, ] @@ -47,6 +57,16 @@ def test_offline_mode(monkeypatch): # Set HF to offline mode and ensure we can still construct an LLM try: monkeypatch.setenv("HF_HUB_OFFLINE", "1") + monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1") + + def disable_connect(*args, **kwargs): + raise RuntimeError("No http calls allowed") + + monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect", + disable_connect) + monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect", + disable_connect) + # Need to re-import huggingface_hub and friends to setup offline mode _re_import_modules() # Cached model files should be used in offline mode @@ -56,6 +76,7 @@ def test_offline_mode(monkeypatch): # Reset the environment after the test # NB: Assuming tests are run in online mode monkeypatch.delenv("HF_HUB_OFFLINE") + monkeypatch.delenv("VLLM_NO_USAGE_STATS") _re_import_modules() pass diff --git a/vllm/transformers_utils/config.py 
b/vllm/transformers_utils/config.py index 85056158bab..fb5cc3ec072 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -10,7 +10,7 @@ from huggingface_hub import (file_exists, hf_hub_download, list_repo_files, try_to_load_from_cache) from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, - LocalEntryNotFoundError, + HFValidationError, LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError) from torch import nn @@ -265,49 +265,66 @@ def get_config( return config +def try_get_local_file(model: Union[str, Path], + file_name: str, + revision: Optional[str] = 'main') -> Optional[Path]: + file_path = Path(model) / file_name + if file_path.is_file(): + return file_path + else: + try: + cached_filepath = try_to_load_from_cache(repo_id=model, + filename=file_name, + revision=revision) + if isinstance(cached_filepath, str): + return Path(cached_filepath) + except HFValidationError: + ... + return None + + def get_hf_file_to_dict(file_name: str, model: Union[str, Path], revision: Optional[str] = 'main'): """ - Downloads a file from the Hugging Face Hub and returns + Downloads a file from the Hugging Face Hub and returns its contents as a dictionary. Parameters: - file_name (str): The name of the file to download. - model (str): The name of the model on the Hugging Face Hub. - - revision (str): The specific version of the model. + - revision (str): The specific version of the model. Returns: - - config_dict (dict): A dictionary containing + - config_dict (dict): A dictionary containing the contents of the downloaded file. """ - file_path = Path(model) / file_name - if file_or_path_exists(model=model, - config_name=file_name, - revision=revision): + file_path = try_get_local_file(model=model, + file_name=file_name, + revision=revision) - if not file_path.is_file(): - try: - hf_hub_file = hf_hub_download(model, - file_name, - revision=revision) - except (RepositoryNotFoundError, RevisionNotFoundError, - EntryNotFoundError, LocalEntryNotFoundError) as e: - logger.debug("File or repository not found in hf_hub_download", - e) - return None - except HfHubHTTPError as e: - logger.warning( - "Cannot connect to Hugging Face Hub. Skipping file " - "download for '%s':", - file_name, - exc_info=e) - return None - file_path = Path(hf_hub_file) + if file_path is None and file_or_path_exists( + model=model, config_name=file_name, revision=revision): + try: + hf_hub_file = hf_hub_download(model, file_name, revision=revision) + except (RepositoryNotFoundError, RevisionNotFoundError, + EntryNotFoundError, LocalEntryNotFoundError) as e: + logger.debug("File or repository not found in hf_hub_download", e) + return None + except HfHubHTTPError as e: + logger.warning( + "Cannot connect to Hugging Face Hub. 
Skipping file " + "download for '%s':", + file_name, + exc_info=e) + return None + file_path = Path(hf_hub_file) + if file_path is not None and file_path.is_file(): with open(file_path) as file: return json.load(file) + return None @@ -328,7 +345,12 @@ def get_pooling_config(model: str, revision: Optional[str] = 'main'): """ modules_file_name = "modules.json" - modules_dict = get_hf_file_to_dict(modules_file_name, model, revision) + + modules_dict = None + if file_or_path_exists(model=model, + config_name=modules_file_name, + revision=revision): + modules_dict = get_hf_file_to_dict(modules_file_name, model, revision) if modules_dict is None: return None @@ -382,17 +404,17 @@ def get_sentence_transformer_tokenizer_config(model: str, revision: Optional[str] = 'main' ): """ - Returns the tokenization configuration dictionary for a + Returns the tokenization configuration dictionary for a given Sentence Transformer BERT model. Parameters: - - model (str): The name of the Sentence Transformer + - model (str): The name of the Sentence Transformer BERT model. - revision (str, optional): The revision of the m odel to use. Defaults to 'main'. Returns: - - dict: A dictionary containing the configuration parameters + - dict: A dictionary containing the configuration parameters for the Sentence Transformer BERT model. """ sentence_transformer_config_files = [ @@ -404,20 +426,33 @@ def get_sentence_transformer_tokenizer_config(model: str, "sentence_xlm-roberta_config.json", "sentence_xlnet_config.json", ] - try: - # If model is on HuggingfaceHub, get the repo files - repo_files = list_repo_files(model, revision=revision, token=HF_TOKEN) - except Exception as e: - logger.debug("Error getting repo files", e) - repo_files = [] - encoder_dict = None - for config_name in sentence_transformer_config_files: - if config_name in repo_files or Path(model).exists(): - encoder_dict = get_hf_file_to_dict(config_name, model, revision) + + for config_file in sentence_transformer_config_files: + if try_get_local_file(model=model, + file_name=config_file, + revision=revision) is not None: + encoder_dict = get_hf_file_to_dict(config_file, model, revision) if encoder_dict: break + if not encoder_dict: + try: + # If model is on HuggingfaceHub, get the repo files + repo_files = list_repo_files(model, + revision=revision, + token=HF_TOKEN) + except Exception as e: + logger.debug("Error getting repo files", e) + repo_files = [] + + for config_name in sentence_transformer_config_files: + if config_name in repo_files: + encoder_dict = get_hf_file_to_dict(config_name, model, + revision) + if encoder_dict: + break + if not encoder_dict: return None From a55f9c09cfd737eee245071a3c8e39d7e8f909e7 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Fri, 7 Feb 2025 05:04:39 -0800 Subject: [PATCH 0050/1240] [MISC][EASY] Break check file names into entry and args in the pre-commit hooks (#12880) Signed-off-by: Lu Fang Signed-off-by: Louis Ulmer --- .pre-commit-config.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0b1c4fdf26a..3fb74ab9b23 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -110,7 +110,10 @@ repos: pass_filenames: false - id: check-filenames name: Check for spaces in all filenames - entry: bash -c 'git ls-files | grep " " && echo "Filenames should not contain spaces!" 
&& exit 1 || exit 0' + entry: bash + args: + - -c + - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0' language: system always_run: true pass_filenames: false From c3b7ef3d5312fc5df102ef04a718190e242b2b3e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 7 Feb 2025 22:21:17 +0800 Subject: [PATCH 0051/1240] [Misc] Remove unnecessary detokenization in multimodal processing (#12868) Signed-off-by: Louis Ulmer --- tests/entrypoints/openai/test_audio.py | 6 +++--- tests/entrypoints/openai/test_vision.py | 4 ++-- tests/entrypoints/openai/test_vision_embedding.py | 4 ++-- vllm/inputs/preprocess.py | 3 --- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 6e206dfd99b..3459f24834d 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -83,7 +83,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=201, total_tokens=211) message = choice.message message = chat_completion.choices[0].message @@ -140,7 +140,7 @@ async def test_single_chat_session_audio_base64encoded( choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=201, total_tokens=211) message = choice.message message = chat_completion.choices[0].message @@ -196,7 +196,7 @@ async def test_single_chat_session_input_audio( choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=202, total_tokens=212) + completion_tokens=10, prompt_tokens=201, total_tokens=211) message = choice.message message = chat_completion.choices[0].message diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 029c9b038b0..c954fca696f 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -92,7 +92,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=775, total_tokens=785) + completion_tokens=10, prompt_tokens=774, total_tokens=784) message = choice.message message = chat_completion.choices[0].message @@ -185,7 +185,7 @@ async def test_single_chat_session_image_base64encoded( choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=775, total_tokens=785) + completion_tokens=10, prompt_tokens=774, total_tokens=784) message = choice.message message = chat_completion.choices[0].message diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index f2ff4a0b07a..cee5274561f 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -93,5 +93,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, 
assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 3072 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 764 - assert embeddings.usage.total_tokens == 764 + assert embeddings.usage.prompt_tokens == 763 + assert embeddings.usage.total_tokens == 763 diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 035e84cc063..53f89996f0f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -260,9 +260,6 @@ def _process_multimodal( mm_processor = self.mm_registry.create_processor( self.model_config, tokenizer) - if isinstance(prompt, list): - prompt = tokenizer.decode(prompt) - if mm_processor_kwargs is None: mm_processor_kwargs = {} From 697220f2b9cc58ecac623a25bd9dd8b48645c430 Mon Sep 17 00:00:00 2001 From: Amit Garg Date: Fri, 7 Feb 2025 06:22:37 -0800 Subject: [PATCH 0052/1240] PR #12718 (#12718) Signed-off-by: Louis Ulmer --- vllm/model_executor/layers/rotary_embedding.py | 18 +++++++++++------- vllm/model_executor/models/llama.py | 5 ++++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b3b9b0e8760..ec204b32f67 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -509,15 +509,12 @@ def __init__( ): super().__init__() - if rotary_dim != head_size: - raise ValueError( - f"`Phi3LongRoPEScaledRotaryEmbedding` does not support \ - rotary_dim != head_size ({rotary_dim}!={head_size}).") if is_neox_style is False: raise ValueError( "`Phi3LongRoPEScaledRotaryEmbedding` only supports neox_style." ) + self.rotary_dim = rotary_dim self.head_size = head_size self.max_position_embeddings = max_position_embeddings self.original_max_position_embeddings = original_max_position_embeddings @@ -557,7 +554,7 @@ def __init__( def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor: rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32) inv_freq = 1.0 / (rescale_factors * (self.base**(torch.arange( - 0, self.head_size, 2, dtype=torch.float) / self.head_size))) + 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim))) return inv_freq def _compute_cos_sin_cache( @@ -596,8 +593,15 @@ def forward( cos = cos.repeat(1, 2).unsqueeze(-2) sin = sin.repeat(1, 2).unsqueeze(-2) - query = query * cos + _rotate_neox(query) * sin - key = key * cos + _rotate_neox(key) * sin + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = query_rot * cos + _rotate_neox(query_rot) * sin + query = torch.cat((query_rot, query_pass), dim=-1) + + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = key_rot * cos + _rotate_neox(key_rot) * sin + key = torch.cat((key_rot, key_pass), dim=-1) return query.flatten(-2), key.flatten(-2) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d91c8782a12..866c6923475 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -128,6 +128,9 @@ def __init__(self, # MistralConfig has an optional head_dim introduced by Mistral-Nemo self.head_dim = getattr(config, "head_dim", self.hidden_size // self.total_num_heads) + # Phi models introduced a partial_rotary_factor parameter in the config + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1) + self.rotary_dim = int(partial_rotary_factor * self.head_dim) self.q_size = 
self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -159,7 +162,7 @@ def __init__(self, self.rotary_emb = get_rope( self.head_dim, - rotary_dim=self.head_dim, + rotary_dim=self.rotary_dim, max_position=max_position_embeddings, base=rope_theta, rope_scaling=rope_scaling, From a213181169509e54a42ce36d72289320aa945f2e Mon Sep 17 00:00:00 2001 From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:26:20 -0500 Subject: [PATCH 0053/1240] [V1] Logprobs and prompt logprobs support (#9880) This PR is adding support for sample logprobs & prompt logprobs to vLLM v1. New behavior: - During model execution, model runner computes sample logprobs (if user-provided logprobs setting is not None) and prompt logprobs (if user-provided prompt_logprobs setting is not None). For both sample and prompt logprobs, the engine core returns 3 vectors: token ids, token logprob values, token ranks. Ranks reflect tokens' 1-indexed positions in the vocabulary vector after sorting the vocabulary by log probability in descending order. - In scheduler.update_from_output(), sample and prompt logprobs are incorporated into the EngineCoreOutput data structure which is transferred to the engine client. If multiprocessing is enabled, then sample and prompt logprobs will be (de)serialized when the EngineCoreOutput data structure is (de)serialized. - During output processing, the LogprobsProcessor transforms the triplet of token ids, token logprobs values, and token ranks into the OpenAI-compatible List[Dict[token id,Logprob]] format (for sample and prompt logprobs respectively.) - Each Logprob instance (whether sample- or prompt-) consists of a token's log-probability, rank, and detokenized string representation. Note that logprob detokenization is handled by the LogprobsProcessor not the detokenizer. 
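As a concrete illustration of the output-processing step described above, the sketch below shows how one position's triplet of top token ids, logprob values, and sampled-token rank could be folded into the OpenAI-compatible Dict[token id, Logprob] form. This is a minimal, hypothetical sketch, not the code introduced by this patch: the Logprob dataclass and the make_position_logprob_dict helper are stand-ins, and the assumption that the sampled token comes first, followed by top-k candidates sorted by descending log probability, is made only for illustration.

from dataclasses import dataclass
from typing import Callable, Dict, List, Optional


@dataclass
class Logprob:
    """Simplified stand-in mirroring the fields described above:
    log-probability, vocab rank, and detokenized string."""
    logprob: float
    rank: Optional[int] = None
    decoded_token: Optional[str] = None


def make_position_logprob_dict(
    topk_token_ids: List[int],
    topk_logprobs: List[float],
    sampled_token_rank: int,
    convert_id_to_token: Callable[[int], str],
) -> Dict[int, Logprob]:
    """Build the Dict[token id, Logprob] entry for one sequence position.

    Assumes (for this sketch) that index 0 holds the sampled token and the
    remaining entries are the top-k candidates already sorted by descending
    log probability, so they receive ranks 1..k.
    """
    position_dict: Dict[int, Logprob] = {}
    for idx, (tok_id, lp) in enumerate(zip(topk_token_ids, topk_logprobs)):
        # The sampled token keeps the rank reported by the engine core;
        # the top-k entries are ranked by their sorted position.
        rank = sampled_token_rank if idx == 0 else idx
        position_dict[tok_id] = Logprob(
            logprob=lp,
            rank=rank,
            decoded_token=convert_id_to_token(tok_id),
        )
    return position_dict


# Example usage (str as a trivial stand-in for tokenizer detokenization):
# sampled token 42 has rank 3 in the vocab; 7 and 13 are top-k candidates.
# make_position_logprob_dict([42, 7, 13], [-1.2, -0.3, -0.9], 3, str)

A caller would build one such dict per generated position and append it to the request's sample-logprobs list; prompt logprobs follow the same per-position shape, with a leading None entry for the first prompt token, as exercised by the tests added below.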
Signed-off-by: Andrew Feldman Signed-off-by: Nick Hill Signed-off-by: rshaw@neuralmagic.com Co-authored-by: rshaw@neuralmagic.com Co-authored-by: Nick Hill Signed-off-by: Louis Ulmer --- tests/v1/core/test_scheduler.py | 4 +- tests/v1/engine/conftest.py | 90 +++ tests/v1/engine/test_async_llm.py | 49 +- tests/v1/engine/test_llm_engine.py | 23 + tests/v1/engine/test_output_processor.py | 553 +++++++++++++++--- tests/v1/engine/utils.py | 382 ++++++++++++ tests/v1/entrypoints/__init__.py | 0 tests/v1/entrypoints/conftest.py | 161 +++++ .../v1/entrypoints/openai/test_completion.py | 475 +++++++++++++++ tests/v1/sample/test_logprobs.py | 392 +++++++++++++ tests/v1/sample/test_logprobs_e2e.py | 52 ++ tests/v1/sample/utils.py | 120 ++++ vllm/outputs.py | 10 +- vllm/transformers_utils/detokenizer_utils.py | 19 + vllm/v1/core/scheduler.py | 43 +- vllm/v1/engine/__init__.py | 10 +- vllm/v1/engine/core.py | 5 +- vllm/v1/engine/core_client.py | 5 +- vllm/v1/engine/detokenizer.py | 54 +- vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/logprobs.py | 194 ++++++ vllm/v1/engine/output_processor.py | 126 ++-- vllm/v1/engine/processor.py | 39 +- vllm/v1/metrics/stats.py | 19 +- vllm/v1/outputs.py | 54 +- vllm/v1/sample/metadata.py | 3 +- vllm/v1/sample/sampler.py | 94 ++- vllm/v1/serial_utils.py | 50 +- vllm/v1/worker/gpu_input_batch.py | 27 +- vllm/v1/worker/gpu_model_runner.py | 102 +++- 30 files changed, 2869 insertions(+), 287 deletions(-) create mode 100644 tests/v1/engine/conftest.py create mode 100644 tests/v1/engine/test_llm_engine.py create mode 100644 tests/v1/engine/utils.py create mode 100644 tests/v1/entrypoints/__init__.py create mode 100644 tests/v1/entrypoints/conftest.py create mode 100644 tests/v1/entrypoints/openai/test_completion.py create mode 100644 tests/v1/sample/test_logprobs.py create mode 100644 tests/v1/sample/test_logprobs_e2e.py create mode 100644 tests/v1/sample/utils.py create mode 100644 vllm/v1/engine/logprobs.py diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 8eb08f3e842..0d29729a454 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -195,8 +195,8 @@ def test_schedule_partial_requests(): req_ids=[request.request_id for request in requests], req_id_to_index=req_to_index, sampled_token_ids=[0] * len(requests), - logprob_token_ids_cpu=None, - logprobs_cpu=None, + logprobs=None, + prompt_logprobs_dict={}, ) scheduler.update_from_output(output, model_runner_output) diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py new file mode 100644 index 00000000000..560dc312185 --- /dev/null +++ b/tests/v1/engine/conftest.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Tuple + +import pytest +import torch +from transformers import AutoTokenizer + +from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, + NUM_SAMPLE_LOGPROBS_UNDER_TEST, PROMPT_LEN, + TOKENIZER_NAME, + DummyOutputProcessorTestVectors, + generate_dummy_prompt_logprobs_tensors, + generate_dummy_sample_logprobs) +from vllm.engine.arg_utils import EngineArgs +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs + +from tests.v1.engine.utils import FULL_STRINGS # isort: skip + +EngineCoreSampleLogprobsType = List[Tuple[torch.Tensor, torch.Tensor]] +EngineCorePromptLogprobsType = Tuple[torch.Tensor, torch.Tensor] + + +def _build_test_vectors_no_logprobs() -> DummyOutputProcessorTestVectors: + """Generate output processor dummy test vectors, without logprobs + + 
Returns: + DummyOutputProcessorTestVectors instance with no logprobs + """ + + tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + vllm_config = EngineArgs(model=TOKENIZER_NAME).create_engine_config() + # Tokenize prompts under test & create dummy generated tokens + prompt_tokens = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS + ] + generation_tokens = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS + ] + # Generate prompt strings + prompt_strings = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in prompt_tokens + ] + prompt_strings_len = [ + len(prompt_string) for prompt_string in prompt_strings + ] + return DummyOutputProcessorTestVectors( + tokenizer=tokenizer, + tokenizer_group=init_tokenizer_from_configs( + vllm_config.model_config, vllm_config.scheduler_config, + vllm_config.parallel_config, vllm_config.lora_config), + vllm_config=vllm_config, + full_tokens=[tokenizer(text).input_ids for text in FULL_STRINGS], + prompt_tokens=prompt_tokens, + generation_tokens=generation_tokens, + prompt_strings=prompt_strings, + prompt_strings_len=prompt_strings_len, + generation_strings=[ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, prompt_strings_len) + ], + prompt_logprobs=[], + generation_logprobs=[]) + + +@pytest.fixture +def dummy_test_vectors() -> DummyOutputProcessorTestVectors: + """Generate output processor dummy test vectors, with logprobs + + Returns: + DummyOutputProcessorTestVectors instance with logprobs + """ + # Build dummy test vectors without logprobs + dtv = _build_test_vectors_no_logprobs() + # Inject logprobs into dummy test vectors + # data structure + dtv.generation_logprobs = [ + generate_dummy_sample_logprobs( + sampled_tokens_list=tokens_list, + num_logprobs=NUM_SAMPLE_LOGPROBS_UNDER_TEST, + tokenizer=dtv.tokenizer) for tokens_list in dtv.generation_tokens + ] + dtv.prompt_logprobs = [ + generate_dummy_prompt_logprobs_tensors( + prompt_tokens_list=tokens_list, + num_logprobs=NUM_PROMPT_LOGPROBS_UNDER_TEST, + tokenizer=dtv.tokenizer) for tokens_list in dtv.prompt_tokens + ] + return dtv diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 4b5bc9ced37..94e18289e3c 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -2,10 +2,11 @@ import asyncio from contextlib import ExitStack -from typing import List, Tuple +from typing import List, Optional, Tuple import pytest +from tests.v1.engine.utils import PLP_APC_UNSUPPORTED_MSG from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.platforms import current_platform @@ -21,13 +22,19 @@ disable_log_requests=True) -async def generate(engine: AsyncLLM, request_id: str, +async def generate(engine: AsyncLLM, + request_id: str, output_kind: RequestOutputKind, - max_tokens: int) -> Tuple[int, str]: + max_tokens: int, + prompt_logprobs: Optional[int] = None) -> Tuple[int, str]: + # Ensure generate doesn't complete too fast for cancellation test. 
+ await asyncio.sleep(0.2) + count = 0 sampling_params = SamplingParams(max_tokens=max_tokens, output_kind=output_kind, - temperature=0) + temperature=0, + prompt_logprobs=prompt_logprobs) async for out in engine.generate(request_id=request_id, prompt="Hello my name is Robert and", sampling_params=sampling_params): @@ -43,6 +50,40 @@ async def generate(engine: AsyncLLM, request_id: str, return count, request_id +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.asyncio +async def test_async_llm_refuses_prompt_logprobs_with_apc( + monkeypatch, output_kind: RequestOutputKind): + """Test passes if AsyncLLM raises an exception when it is configured + for automatic prefix caching and it receives a request with + prompt_logprobs enabled, which is incompatible.""" + # TODO(rickyx): Remove monkeypatch VLLM_USE_V1 setting once we have a + # better way to test V1 so that in the future when we switch, we don't + # have to change all the tests. + monkeypatch.setenv("VLLM_USE_V1", "1") + # Create AsyncLLM engine with APC + apc_engine_args = AsyncEngineArgs(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.8, + disable_log_requests=True) + engine = AsyncLLM.from_engine_args(apc_engine_args) + try: + with pytest.raises(ValueError) as excinfo: + # Issue a request with prompt logprobs enabled, which should fail + await asyncio.create_task( + generate(engine, + "request-0", + output_kind, + 10, + prompt_logprobs=5)) + # Validate exception string is correct + assert str(excinfo.value) == PLP_APC_UNSUPPORTED_MSG + finally: + # Shut down engine + engine.shutdown() + + @pytest.mark.parametrize( "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) @pytest.mark.asyncio diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py new file mode 100644 index 00000000000..84b634316cb --- /dev/null +++ b/tests/v1/engine/test_llm_engine.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from tests.v1.engine.utils import PLP_APC_UNSUPPORTED_MSG +from vllm import LLM, SamplingParams + + +def test_llm_engine_refuses_prompt_logprobs_with_apc(monkeypatch): + """Test passes if LLMEngine raises an exception when it is configured + for automatic prefix caching and it receives a request with + prompt_logprobs enabled, which is incompatible.""" + + monkeypatch.setenv("VLLM_USE_V1", "1") + # TODO(nick): Single-proc to work around a ZMQ shutdown hang for now. 
+ monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + with pytest.raises(ValueError) as excinfo: + LLM(model="facebook/opt-125m", enable_prefix_caching=True).generate( + "Hello, my name is", + SamplingParams(temperature=0.8, top_p=0.95, prompt_logprobs=5)) + + # Validate exception string is correct + assert str(excinfo.value) == PLP_APC_UNSUPPORTED_MSG diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 5782a249f36..c8f43edb70b 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -1,82 +1,47 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List +import math +from typing import Dict, List, Optional import pytest -from transformers import AutoTokenizer -from vllm.engine.arg_utils import EngineArgs +from tests.v1.engine.utils import (NUM_PROMPT_LOGPROBS_UNDER_TEST, + NUM_SAMPLE_LOGPROBS_UNDER_TEST, + STOP_STRINGS, + DummyOutputProcessorTestVectors, + MockEngineCore) from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.sequence import PromptLogprobs, SampleLogprobs +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.output_processor import OutputProcessor -TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" -VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config() -TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config, - VLLM_CONFIG.scheduler_config, - VLLM_CONFIG.parallel_config, - VLLM_CONFIG.lora_config) -tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) - -FULL_STRINGS = [ - "My name is Robert from Neural Magic and I love working on vLLM so much!", - "Red Hat is the best open source company by far across Linux, K8s, and AI.", - "Nick is the name of my brother in addition to my colleague from Red Hat.", -] - -STOP_STRINGS = ["I love working on", "company by far", "brother in"] - -FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] -PROMPT_LEN = 5 -PROMPT_TOKENS = [ - tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS -] -GENERATION_TOKENS = [ - tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS -] -PROMPT_STRINGS = [ - tokenizer.decode(prompt_tokens, skip_special_tokens=True) - for prompt_tokens in PROMPT_TOKENS -] -PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] -GENERATION_STRINGS = [ - text[prompt_len:] - for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) -] - - -class MockEngineCore: - """Mock outputs form premade tokens lists.""" - - def __init__(self, tokens_list: List[List[int]]): - self.tokens_list = tokens_list - self.current_idx = 0 - - def get_outputs(self) -> List[EngineCoreOutput]: - token_idx = self.current_idx - self.current_idx += 1 - - outputs = [] - for req_idx, token_ids in enumerate(self.tokens_list): - if len(token_ids) > token_idx: - output = EngineCoreOutput(request_id=f"request-{req_idx}", - new_token_ids=[token_ids[token_idx]], - finished=False) - if token_idx == len(token_ids) - 1: - output.finished = True - output.finish_reason = "stopped" - outputs.append(output) - - return outputs + +def _ref_convert_id_to_token( + tokenizer: AnyTokenizer, + token_id: int, +) -> str: + """Reference impl of logprobs detokenization. 
+ + Args: + tokenizer: tokenizer used by the model under test + token_id: convert this token id + + Returns: + String representation of input token id + """ + return tokenizer.convert_ids_to_tokens(token_id) or "" @pytest.mark.parametrize( "request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) -def test_incremental_detokenization(request_output_kind: RequestOutputKind): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) - engine_core = MockEngineCore(GENERATION_TOKENS) +def test_incremental_detokenization(request_output_kind: RequestOutputKind, + dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens) # Make N requests. requests = [ @@ -94,10 +59,10 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): spaces_between_special_tokens=False, output_kind=request_output_kind, stop=[], - include_stop_str_in_output=False)) - for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + include_stop_str_in_output=False, + )) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, + dummy_test_vectors.prompt_tokens)) ] # Add requests to the detokenizer. @@ -113,7 +78,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): break # Step the Detokenizer. - processed_outputs = output_processor.process_outputs(outputs, ) + processed_outputs = output_processor.process_outputs(outputs) request_outputs = processed_outputs.request_outputs requests_to_abort = processed_outputs.reqs_to_abort assert len(requests_to_abort) == 0 @@ -132,7 +97,8 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): # Confirmed tracked values matches what we expected. 
for idx, (ref_gen_str, ref_gen_toks) in enumerate( - zip(GENERATION_STRINGS, GENERATION_TOKENS)): + zip(dummy_test_vectors.generation_strings, + dummy_test_vectors.generation_tokens)): gen_str = gen_strings[f"request-{idx}"] gen_toks = gen_tokens[f"request-{idx}"] @@ -143,15 +109,390 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): assert not output_processor.has_unfinished_requests() +def _validate_logprobs( + gen_tokens: Dict[str, List[int]], + gen_logprobs: Dict[str, Optional[SampleLogprobs]], + gen_prompt_logprobs: Dict[str, Optional[PromptLogprobs]], + gen_cumulative_logprob: Dict[str, float], + dtv: DummyOutputProcessorTestVectors, + request_id_list: List[str], + num_sample_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], +) -> None: + for req_idx, req_id in enumerate(request_id_list): + new_tokens = gen_tokens[req_id] + logprobs = gen_logprobs[req_id] + prompt_logprobs = gen_prompt_logprobs[req_id] + cumulative_logprob = gen_cumulative_logprob[req_id] + prompt_token_ids = dtv.prompt_tokens[req_idx] + ref_logprobs = dtv.generation_logprobs[req_idx] + ref_prompt_logprobs = dtv.prompt_logprobs[req_idx] + if num_sample_logprobs is not None: + # Validate sample logprobs + assert logprobs is not None, (f"Request {req_id} requires sample" + " logprobs but sample logprobs are" + " None.") + # Require num sampled tokens to match num + # sampled logprobs - especially important + # to check since the detokenizer can cause + # a request to finish early due to a stop + # string being hit + num_new_tokens = len(new_tokens) + len_sample_logprobs = len(logprobs) + assert num_new_tokens == len_sample_logprobs, ( + f"Request {req_id} has {num_new_tokens}" + " completion tokens but has" + f" {len_sample_logprobs} sample logprobs.") + ref_cumulative_logprob = 0.0 + for idx, (sampled_token, + pos_logprob_dict) in enumerate(zip(new_tokens, + logprobs)): + # Break out the reference log probability value & + # logprob token id tensors associated with this + # position in the completion. Also break out the + # sampled token ranks + (ref_pos_logprob_toks, ref_pos_logprob_vals, + ref_sampled_token_rank) = ref_logprobs[idx] + # For each position in the completion sequence, + # ensure the actual sampled token is among the + # logprobs + assert sampled_token in pos_logprob_dict, ( + f"Sampled token {sampled_token} not" + f" present in logprob at index {idx}") + + # Validate number of sample logprobs + num_lp_toks = len(pos_logprob_dict) + assert (num_lp_toks == num_sample_logprobs + or num_lp_toks == num_sample_logprobs + + 1), ("Valid numbers of sample logprobs are" + f" {num_sample_logprobs} or" + f" {num_sample_logprobs+1} but" + f" {num_lp_toks} logprobs found at" + f" position {idx}. Logprobs dict:" + f" {pos_logprob_dict}") + + # Validate sampled token logprob rank + smp_lp = pos_logprob_dict[sampled_token] + smp_lp_rank = smp_lp.rank + assert (ref_sampled_token_rank == smp_lp_rank), ( + "Sampled token logprob rank" + f" {smp_lp_rank} does not match" + " correct value" + f" {ref_sampled_token_rank}" + f" in Logprob {smp_lp}") + + # Validate that the logprob processor yields + # the correct log probabilities and valid + # rankings + rank_one_appears = False + for jdx in range(1, len(ref_pos_logprob_toks)): + # Iterate over the (logprob val,logprob tok id) + # pairs expected by the test fixture at this + # position in the completion. 
+ ref_lp_val = ref_pos_logprob_vals[jdx] + ref_tok_id = ref_pos_logprob_toks[jdx] + assert ref_tok_id in pos_logprob_dict, ( + f"Expected token {ref_tok_id} to be" + f" in logprob dict but it is not.") + + # Extract actually-generated logprob + # info + lp = pos_logprob_dict[ref_tok_id] + lp_val = lp.logprob + lp_rank = lp.rank + + # A "top" (rank 1) logprob must be + # present + rank_one_appears = (True + if lp_rank == 1 else rank_one_appears) + + # Rank must be >= 1 + assert lp_rank >= 1, (f"Logprob {lp} has invalid" + f" rank {lp_rank} < 1." + f" Logprob dict: {pos_logprob_dict}") + + # Validate log probability + assert math.isclose(lp_val, ref_lp_val), ( + f"Token id {ref_tok_id} appears in logprobs dict" + f" at position {idx} in completion with log" + f" probability {lp_val} but {ref_lp_val} was" + f" expected. Logprob: {lp}") + + assert rank_one_appears, (f"No Logprob has rank 1" + " in the following Logprob" + f" dict: {pos_logprob_dict}") + + # Validate logprobs detokenization + for lp_tok in pos_logprob_dict: + # Confirm that sample logprob decoded token matches + # the logprob token id at this sequence position + decoded_token = pos_logprob_dict[lp_tok].decoded_token + ref_decoded_token = _ref_convert_id_to_token( + dtv.tokenizer, lp_tok) + assert decoded_token == ref_decoded_token, ( + f"Sampled logprob token id {lp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})") + + ref_cumulative_logprob += pos_logprob_dict[ + sampled_token].logprob + # Assert that cumulative logprobs are correct + assert math.isclose(cumulative_logprob, ref_cumulative_logprob) + else: + # Sample logprobs disabled for this request + assert logprobs is None + assert cumulative_logprob is None + + if num_prompt_logprobs is not None: + # Validate prompt logprobs + assert prompt_logprobs is not None, ( + f"Request {req_id} requires prompt" + " logprobs but prompt logprobs are" + " None.") + # Require num prompt tokens to match num + # prompt logprobs + num_prompt_tokens = len(prompt_token_ids) + len_prompt_logprobs = len(prompt_logprobs) + assert num_prompt_tokens == len_prompt_logprobs, ( + f"Request {req_id} has {num_prompt_tokens}" + " prompt tokens but has" + f" {len_prompt_logprobs} prompt logprobs.") + # First prompt logprob is None + first_plp_dict = prompt_logprobs[0] + assert first_plp_dict is None, ( + f"Request {req_id} first prompt logprob" + f" should be None but has following value" + f" instead: {first_plp_dict}") + # Break out the reference prompt log prob value & + # logprob token id matrices for the whole prompt. + # Also break out the prompt token rank vector + (ref_prompt_logprob_toks, ref_prompt_logprob_vals, + ref_prompt_token_ranks) = ref_prompt_logprobs + for idx, (prompt_token, pos_logprob_dict) in enumerate( + zip(prompt_token_ids[1:], prompt_logprobs[1:])): + + # Break out the reference prompt log prob value + # vector, prompt logprob token id vector, and + # prompt token rank at the current position. 
+ (ref_pos_prompt_logprob_toks, ref_pos_prompt_logprob_vals, + ref_pos_prompt_token_rank) = (ref_prompt_logprob_toks[idx, :], + ref_prompt_logprob_vals[idx, :], + ref_prompt_token_ranks[idx]) + + # For each position in the prompt sequence, + # ensure the actual prompt token is among the + # logprobs + assert prompt_token in pos_logprob_dict, ( + f"Prompt token {prompt_token} not" + f" present in logprob at index {idx}") + # Validate number of prompt logprobs + num_plp_toks = len(pos_logprob_dict) + assert (num_plp_toks == num_prompt_logprobs + or num_plp_toks == num_prompt_logprobs + + 1), ("Valid numbers of prompt logprobs are" + f" {num_prompt_logprobs} or" + f" {num_prompt_logprobs+1} but" + f" {num_plp_toks} logprobs found at" + f" position {idx}. Logprobs dict:" + f" {pos_logprob_dict}") + + # Validate prompt token logprob rank + prmpt_tok_lp = pos_logprob_dict[prompt_token] + prmpt_tok_lp_rank = prmpt_tok_lp.rank + ref_prmpt_tok_lp_rank = ref_pos_prompt_token_rank + assert (ref_prmpt_tok_lp_rank == prmpt_tok_lp_rank), ( + "Prompt token logprob rank" + f" {prmpt_tok_lp_rank} does not match" + " correct value" + f" {ref_prmpt_tok_lp_rank}" + f" in Logprob {prmpt_tok_lp}") + + # Validate that the logprob processor yields + # the correct prompt log probs and valid + # rankings + rank_one_appears = False + for jdx in range(1, len(ref_pos_prompt_logprob_toks)): + # Iterate over the (logprob val,logprob tok id) + # pairs expected by the test fixture at this + # position in the completion. + ref_plp_val = float(ref_pos_prompt_logprob_vals[jdx]) + ref_tok_id = int(ref_pos_prompt_logprob_toks[jdx]) + assert ref_tok_id in pos_logprob_dict, ( + f"Expected token {ref_tok_id} to be" + f" in logprob dict but it is not.") + + # Extract actually-generated logprob + # info + plp = pos_logprob_dict[ref_tok_id] + plp_val = plp.logprob + plp_rank = plp.rank + + # A "top" (rank 1) logprob must be + # present + rank_one_appears = (True + if plp_rank == 1 else rank_one_appears) + + # Rank must be >= 1 + assert plp_rank >= 1, ( + f"Logprob {plp} has invalid" + f" rank {plp_rank} < 1." + f" Logprob dict: {pos_logprob_dict}") + + # Validate log probability + assert math.isclose(plp_val, ref_plp_val), ( + f"Token id {ref_tok_id} appears in logprobs dict" + f" at position {idx} in completion with log" + f" probability {plp_val} but {ref_plp_val} was" + f" expected. 
Logprob: {plp}") + + assert rank_one_appears, (f"No Logprob has rank 1" + " in the following Logprob" + f" dict: {pos_logprob_dict}") + + # Validate prompt logprob detokenization + for plp_tok in pos_logprob_dict: + # Confirm that prompt logprob decoded token matches + # the logprob token id at this sequence position + decoded_token = pos_logprob_dict[plp_tok].decoded_token + ref_decoded_token = _ref_convert_id_to_token( + dtv.tokenizer, plp_tok) + assert decoded_token == ref_decoded_token, ( + f"Prompt logprob token id {plp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})") + else: + # Prompt logprobs disabled for this request + assert prompt_logprobs is None + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +@pytest.mark.parametrize("num_sample_logprobs", + [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) +@pytest.mark.parametrize("num_prompt_logprobs", + [None, NUM_PROMPT_LOGPROBS_UNDER_TEST]) +def test_logprobs_processor(request_output_kind: RequestOutputKind, + num_sample_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], + dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=None if num_sample_logprobs is None else + dummy_test_vectors.generation_logprobs, + prompt_logprobs_raw=None + if num_prompt_logprobs is None else dummy_test_vectors.prompt_logprobs) + + # Make N requests. + request_id_list = [ + f"request-{idx}" + for idx in range(len(dummy_test_vectors.prompt_strings)) + ] + requests = [ + EngineCoreRequest(request_id=request_id_list[idx], + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + logprobs=num_sample_logprobs, + prompt_logprobs=num_prompt_logprobs, + )) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, + dummy_test_vectors.prompt_tokens)) + ] + + # Add requests to the detokenizer. + for request in requests: + output_processor.add_request(request) + + gen_tokens = {} + gen_logprobs = {} + gen_prompt_logprobs = {} + gen_cumulative_logprobs = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the logprobs processor. + processed_outputs = output_processor.process_outputs(outputs) + request_outputs = processed_outputs.request_outputs + requests_to_abort = processed_outputs.reqs_to_abort + assert len(requests_to_abort) == 0 + + # Update tracking. 
+ for request_output in request_outputs: + request_id = request_output.request_id + new_tokens = request_output.outputs[0].token_ids + prompt_logprobs = request_output.prompt_logprobs + logprobs = request_output.outputs[0].logprobs + gen_cumulative_logprobs[request_id] = request_output.outputs[ + 0].cumulative_logprob + if request_id not in gen_logprobs: + # Start tracking sample and prompt logprobs for this request + gen_tokens[request_id] = new_tokens + gen_logprobs[request_id] = logprobs + gen_prompt_logprobs[request_id] = prompt_logprobs + else: + # Extend logprobs tracker + gen_tokens[request_id].extend(new_tokens) + lp = gen_logprobs[request_id] + plp = gen_prompt_logprobs[request_id] + if lp: + lp.extend(logprobs) + if plp: + plp.extend(prompt_logprobs) + + # Confirmed tracked logprobs match what we expect + _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, + gen_cumulative_logprobs, dummy_test_vectors, + request_id_list, num_sample_logprobs, + num_prompt_logprobs) + + assert output_processor.get_num_unfinished_requests() == 0 + assert not output_processor.has_unfinished_requests() + + @pytest.mark.parametrize("include_stop_str_in_output", [True, False]) -def test_stop_string(include_stop_str_in_output: bool): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False) - engine_core = MockEngineCore(GENERATION_TOKENS) +@pytest.mark.parametrize("num_sample_logprobs", + [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST]) +@pytest.mark.parametrize("num_prompt_logprobs", + [None, NUM_PROMPT_LOGPROBS_UNDER_TEST]) +def test_stop_string(include_stop_str_in_output: bool, + num_sample_logprobs: Optional[int], + num_prompt_logprobs: Optional[int], dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + log_stats=False) + engine_core = MockEngineCore( + tokens_list=dummy_test_vectors.generation_tokens, + generated_logprobs_raw=dummy_test_vectors.generation_logprobs + if num_sample_logprobs else None, + prompt_logprobs_raw=dummy_test_vectors.prompt_logprobs + if num_prompt_logprobs else None) # Make N requests. + request_id_list = [ + f"request-{idx}" + for idx in range(len(dummy_test_vectors.prompt_strings)) + ] requests = [ EngineCoreRequest( - request_id=f"request-{idx}", + request_id=request_id_list[idx], prompt=prompt, prompt_token_ids=prompt_tokens, arrival_time=0, @@ -166,9 +507,11 @@ def test_stop_string(include_stop_str_in_output: bool): output_kind=RequestOutputKind.DELTA, stop=STOP_STRINGS, include_stop_str_in_output=include_stop_str_in_output, - )) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + logprobs=num_sample_logprobs, + prompt_logprobs=num_prompt_logprobs, + )) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, + dummy_test_vectors.prompt_tokens)) ] # Add requests to the detokenizer. @@ -176,6 +519,10 @@ def test_stop_string(include_stop_str_in_output: bool): output_processor.add_request(request) gen_strings = {} + gen_tokens = {} + gen_logprobs = {} + gen_prompt_logprobs = {} + gen_cumulative_logprobs = {} aborted = [] while True: # Mock output from the EngineCore. 
@@ -199,14 +546,29 @@ def test_stop_string(include_stop_str_in_output: bool): request_id = request_output.request_id new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + prompt_logprobs = request_output.prompt_logprobs + logprobs = request_output.outputs[0].logprobs + gen_cumulative_logprobs[request_id] = request_output.outputs[ + 0].cumulative_logprob if request_id not in gen_strings: gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + gen_logprobs[request_id] = logprobs + gen_prompt_logprobs[request_id] = prompt_logprobs else: gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + lp = gen_logprobs[request_id] + plp = gen_prompt_logprobs[request_id] + if lp: + lp.extend(logprobs) + if plp: + plp.extend(prompt_logprobs) # Confirmed tracked values matches what we expected. - for idx, (ref_gen_str, - stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + for idx, (ref_gen_str, stop_str) in enumerate( + zip(dummy_test_vectors.generation_strings, STOP_STRINGS)): # Request should be aborted. request_id = f"request-{idx}" @@ -227,13 +589,20 @@ def test_stop_string(include_stop_str_in_output: bool): assert gen_str == ref_str_exc_stop, ( f"{gen_str=}, {ref_str_exc_stop=}") + # Confirmed tracked logprobs match what we expect + _validate_logprobs(gen_tokens, gen_logprobs, gen_prompt_logprobs, + gen_cumulative_logprobs, dummy_test_vectors, + request_id_list, num_sample_logprobs, + num_prompt_logprobs) + assert output_processor.get_num_unfinished_requests() == 0 assert not output_processor.has_unfinished_requests() -def test_iteration_stats(): - output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True) - engine_core = MockEngineCore(GENERATION_TOKENS) +def test_iteration_stats(dummy_test_vectors): + output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, + log_stats=True) + engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) # Make N requests. requests = [ @@ -248,13 +617,13 @@ def test_iteration_stats(): eos_token_id=None, lora_request=None, sampling_params=SamplingParams(), - ) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ) for idx, (prompt, prompt_tokens) in enumerate( + zip(dummy_test_vectors.prompt_strings, + dummy_test_vectors.prompt_tokens)) ] # Add all requests except one to the OutputProcessor. 
- num_active = len(GENERATION_TOKENS) - 1 + num_active = len(dummy_test_vectors.generation_tokens) - 1 for request in requests[:num_active]: output_processor.add_request(request) inactive_request = requests[num_active] @@ -263,8 +632,10 @@ def test_iteration_stats(): outputs = engine_core.get_outputs()[:num_active] processed_outputs = output_processor.process_outputs(outputs) iteration_stats = processed_outputs.iteration_stats - total_prompt_tokens = sum( - [len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active]]) + total_prompt_tokens = sum([ + len(prompt_tokens) + for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active] + ]) assert iteration_stats.num_prompt_tokens == total_prompt_tokens assert iteration_stats.num_generation_tokens == num_active @@ -283,7 +654,7 @@ def test_iteration_stats(): outputs = engine_core.get_outputs()[:num_active] processed_outputs = output_processor.process_outputs(outputs) iteration_stats = processed_outputs.iteration_stats - total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1]) + total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1]) assert iteration_stats.num_prompt_tokens == total_prompt_tokens assert iteration_stats.num_generation_tokens == num_active diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py new file mode 100644 index 00000000000..39248ce86f2 --- /dev/null +++ b/tests/v1/engine/utils.py @@ -0,0 +1,382 @@ +# SPDX-License-Identifier: Apache-2.0 + +import random +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast + +from vllm.engine.arg_utils import EngineArgs +from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( + BaseTokenizerGroup) +from vllm.v1.engine import EngineCoreOutput, FinishReason +from vllm.v1.outputs import LogprobsLists, LogprobsTensors + +GeneralTokenizerType = Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + +# Number of sample logprobs to request when testing sample logprobs +NUM_SAMPLE_LOGPROBS_UNDER_TEST = 5 +# Number of prompt logprobs to request when testing prompt logprobs +NUM_PROMPT_LOGPROBS_UNDER_TEST = 7 + +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] +STOP_STRINGS = ["I love working on", "company by far", "brother in"] +PROMPT_LEN = 5 + +PLP_APC_UNSUPPORTED_MSG = ("Prefix caching with prompt logprobs not yet " + "supported on VLLM V1.") + +random.seed(42) + + +def _create_random_top_logprob_test_vector( + num_logprobs: int, + lower: float, + upper: float, +) -> torch.Tensor: + """Create a random vector of top logprob float values. + + Use to create fake sample logprobs for testing. + + Note that a real production scenario would require + logprobs to be sorted in descending order, something + which is omitted in this function. 
+ + Args: + num_logprobs: number of top logprobs + lower: lower range of logprob float values + upper: upper range of logprob float values + + Returns: + 1D length-`num_logprobs` torch Tensor of float logprob values + """ + return torch.rand(num_logprobs) * (upper - lower) + lower + + +def _create_random_top_logprob_test_matrix( + shape: Tuple, + lower: float, + upper: float, +) -> torch.Tensor: + """Create a random matrix of top logprob float values. + + Use to create fake prompt logprobs for testing. + + Note that a real production scenario would require + logprobs to be sorted in descending order along rows, + something which is omitted in this function. + + Args: + shape: (num_tokens,num_logprobs) tuple representing + matrix shape + lower: lower range of logprob float values + upper: upper range of logprob float values + + Returns: + 2D num_tokens x num_logprobs torch Tensor of float logprob values + """ + return torch.rand(*shape) * (upper - lower) + lower + + +def _create_random_top_token_test_vector( + num_logprobs: int, + lower: int, + upper: int, + sampled_token_id: int, + adjust_num_logprobs: bool = True) -> Tuple[torch.Tensor, int]: + """Create a random vector of top logprob token indices + + Use to create fake sample logprobs for testing. The sampled token + ID must always be one of the top logprobs, which this dummy test + vector generator enforces. OpenAI API + compatible engines must be able to return an additional sample + logprob for the sampled token if the sampled token was not + among the top sample logprobs; `adjust_num_logprobs` emulates + this behavior by increasing the vector length by 1 if + `adjust_num_logprobs` is set. + + Args: + num_logprobs: number of top logprobs + lower: lower range of token ids + upper: upper range of token ids + sampled_token_id: the token actually sampled + adjust_num_logprobs: if True, emulate situation where sampled + token logprob must be injected into top + logprobs + + Returns: + 1D length-x torch Tensor of token ids where x is + `num_logprobs+1` if `adjust_num_logprobs` and + `num_logprobs` otherwise + sampled_token_rank: the rank of sampled_token_id in the vocab + vector when sorted in descending order by + logprob + """ + + # Calculate the final number of logprobs required + total_logprobs = num_logprobs + 1 if adjust_num_logprobs else num_logprobs + + # Generate random indices using torch + choice_tensor = torch.randperm(upper - lower)[:total_logprobs] + lower + + # Ensure the sampled token ID is included in the tensor + choice_tensor[0] = sampled_token_id + + # Check if the sampled_token_id occurs in choice_tensor[1:] + if sampled_token_id in choice_tensor[1:]: + sampled_token_rank = (choice_tensor[1:] == sampled_token_id).nonzero( + as_tuple=True)[0].item() + else: + # If not found, assign a random int between num_logprobs and 50700 + sampled_token_rank = random.randint(num_logprobs, 50700) + + return choice_tensor, sampled_token_rank + + +def _create_random_top_token_test_matrix( + shape: Tuple[int, int], + lower: int, + upper: int, + tokens_list: List[int], +) -> Tuple[torch.Tensor, torch.Tensor]: + """Create a random matrix of top logprob token indices + + Use to create fake prompt logprobs for testing. + + Token ids are generated randomly and sampled without + replacement. 
+ + Args: + shape: (num_tokens, num_logprobs) tuple representing + matrix shape + lower: lower range of token ids + upper: upper range of token ids + + Returns: + Tuple containing: + - 2D num_tokens x num_logprobs+1 torch Tensor of token ids + - 1D tensor of ranks of prompt tokens in their respective + rows, or random values + """ + num_elements = shape[0] * shape[1] + choice_tensor = torch.randperm(upper - lower)[:num_elements] + lower + matrix = torch.cat( + (torch.tensor(tokens_list, dtype=torch.int).unsqueeze(-1), + choice_tensor.view(shape)), + dim=1) + + # Initialize the tensor for storing the ranks + prompt_token_ranks = torch.empty(shape[0], dtype=torch.int) + + # Iterate over each row to check presence of + # tokens_list[rdx] and determine its index + for rdx in range(shape[0]): + row = matrix[rdx, + 1:] # Skip the first column as it contains the token list + token_index = (row == tokens_list[rdx]).nonzero(as_tuple=True)[0] + if token_index.numel() > 0: + prompt_token_ranks[rdx] = token_index.item() + else: + prompt_token_ranks[rdx] = random.randint(shape[1], 50700) + + return matrix, prompt_token_ranks + + +def decode_token( + tok_id: int, + tokenizer: PreTrainedTokenizer, +) -> str: + """Reproduce the process of detokenizing a token for testing purposes. + + Args: + tok_id: token id to detokenize + tokenizer: tokenizer to use for detokenization + + Returns: + string representation of token + """ + return tokenizer.convert_ids_to_tokens(tok_id) + + +def generate_dummy_sample_logprobs( + sampled_tokens_list: List, + num_logprobs: int, + tokenizer: PreTrainedTokenizer, +) -> List[Tuple[List[int], List[float], int]]: + """Generate dummy sample logprobs + + Generate a test data structure which imitates the list of sample logprobs + which would be assembled in the engine core during decode phase. + + Args: + sampled_tokens_list: list of sampled tokens + num_logprobs: return `num_logprobs` or `num_logprobs+1` logprobs per token + tokenizer: model tokenizer to use for detokenization + + Returns + List of (top token ids vector, logprobs vector, sampled token rank) + Python lists tuples; in each tuple the logprobs and top token ids + vectors have the same length which is either `num_logprobs` or + `num_logprobs+1`. Sampled token rank is the rank (index+1) of the + sampled token within the vocab vector when sorted by logprob in + descending order. + """ + res = [] + for sampled_token_id in sampled_tokens_list: + ( + token_vector, + sampled_token_rank, + ) = _create_random_top_token_test_vector(num_logprobs, 0, + len(tokenizer.vocab) - 1, + sampled_token_id) + + res.append( + (token_vector, + _create_random_top_logprob_test_vector(num_logprobs + 1, -100, + 0), sampled_token_rank)) + + # Convert tensors in the list tuples to Python lists + res_list_format = [ + (log_probs_tensor.tolist(), token_ids_tensor.tolist(), + sampled_token_rank) + for log_probs_tensor, token_ids_tensor, sampled_token_rank in res + ] + + return res_list_format + + +def generate_dummy_prompt_logprobs_tensors( + prompt_tokens_list: List, + num_logprobs: int, + tokenizer: PreTrainedTokenizer, +) -> LogprobsTensors: + """Generate dummy prompt logprobs tensors + + Generate a test data structure which imitates the torch Tensors of prompt + logprobs which would be assembled in the engine core during chunked + prefill. 
+ + Args: + prompt_tokens_list: list of prompt tokens + num_logprobs: return `num_logprobs` logprobs per token + tokenizer: model tokenizer to use for detokenization + + Returns + Single Tuple of (logprobs matrix, top token ids matrix) torch Tensor, + where both matrices have dimensions + num_prompt_tokens x num_logprobs + """ + # For now, assume the whole prompt is processed in one chunk; thus, + # the number of non-`None` prompt logprobs is `len(prompt_tokens_list)-1`. + # Prior to injecting `None` at the beginning of prompt logprobs (which + # happens later in the detokenizer, not here), the prompt logprobs in + # the ith position are predicting the probability distribution of the + # prompt token in (i+1)st position. Thus, we concat + # `prompt_tokens_list[1:]` to the dummy token ids, just as the engine + # would. + num_prompt_logprobs = len(prompt_tokens_list) - 1 + ( + token_vector, + prompt_token_ranks, + ) = _create_random_top_token_test_matrix( + (num_prompt_logprobs, num_logprobs), 0, + len(tokenizer.vocab) - 1, prompt_tokens_list[1:]) + return LogprobsTensors( + token_vector, + _create_random_top_logprob_test_matrix( + (num_prompt_logprobs, num_logprobs + 1), -100, 0), + prompt_token_ranks) + + +@dataclass +class DummyOutputProcessorTestVectors: + """Dummy test vectors for output processor tests""" + tokenizer: GeneralTokenizerType + tokenizer_group: BaseTokenizerGroup + vllm_config: EngineArgs + full_tokens: List[List[int]] # Prompt + generated tokens + prompt_tokens: List[List[int]] + generation_tokens: List[List[int]] + # Each request is associated with a tuple of + # (top tokens, top logprobs, ranks) prompt logprobs tensors + prompt_logprobs: List[LogprobsTensors] + # Each request is associated with a sample logprobs; a request's + # sample logprobs are a list of (top tokens, top logprobs, ranks) + # sample logprobs tensors at each sequence position + generation_logprobs: List[List[Tuple[List[int], List[float], int]]] + prompt_strings: List[str] + prompt_strings_len: List[int] + generation_strings: List[str] + + +class MockEngineCore: + """Mock engine core outputs form premade tokens lists.""" + + def __init__( + self, + tokens_list: List[List[int]], + # For each request, for each sampled token offset, + # a tuple of + # (list of topk token ids, list of sample logprob vals, rank) + generated_logprobs_raw: Optional[List[List[Tuple[List[int], + List[float], + int]]]] = None, + # For each request, a tuple of + # (prompt logprob val matrix, prompt logprob tok id matrix); + # each matrix has dimensions + # (num prompt toks) x (num prompt logprobs+1) + prompt_logprobs_raw: Optional[List[LogprobsTensors]] = None, + ) -> None: + self.tokens_list = tokens_list + self.current_idx = 0 + self.generated_logprobs_raw = generated_logprobs_raw + self.do_logprobs = generated_logprobs_raw is not None + self.prompt_logprobs_raw = prompt_logprobs_raw + self.do_prompt_logprobs = prompt_logprobs_raw is not None + + def get_outputs(self) -> List[EngineCoreOutput]: + do_logprobs = self.do_logprobs + do_prompt_logprobs = self.do_prompt_logprobs + token_idx = self.current_idx + + outputs = [] + for req_idx, token_ids in enumerate(self.tokens_list): + if len(token_ids) > token_idx: + if do_logprobs: + assert self.generated_logprobs_raw is not None + (logprobs_token_ids_, logprobs_, sampled_token_ranks_) = ( + self.generated_logprobs_raw[req_idx][token_idx]) + logprobs = LogprobsLists( + [logprobs_token_ids_], + [logprobs_], + [sampled_token_ranks_], + ) + else: + logprobs = None + if 
do_prompt_logprobs: + if self.current_idx == 0: + assert self.prompt_logprobs_raw is not None + prompt_logprobs = self.prompt_logprobs_raw[req_idx] + else: + prompt_logprobs = None + else: + prompt_logprobs = None + output = EngineCoreOutput( + request_id=f"request-{req_idx}", + new_token_ids=[token_ids[token_idx]], + new_logprobs=logprobs, + new_prompt_logprobs_tensors=prompt_logprobs, + ) + if token_idx == len(token_ids) - 1: + output.finish_reason = FinishReason.STOP + outputs.append(output) + + self.current_idx += 1 + return outputs diff --git a/tests/v1/entrypoints/__init__.py b/tests/v1/entrypoints/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py new file mode 100644 index 00000000000..b00e168db9d --- /dev/null +++ b/tests/v1/entrypoints/conftest.py @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest + + +@pytest.fixture +def sample_prompts(): + return [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + +@pytest.fixture +def sample_token_ids(): + return [ + [0], + [0, 1], + [0, 2, 1], + [0, 3, 1, 2], + ] + + +@pytest.fixture +def sample_regex(): + return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") + + +@pytest.fixture +def sample_json_schema(): + return { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "skills": { + "type": "array", + "items": { + "type": "string", + "maxLength": 10 + }, + "minItems": 3 + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": { + "type": "string" + }, + "duration": { + "type": "number" + }, + "position": { + "type": "string" + } + }, + "required": ["company", "position"] + } + } + }, + "required": ["name", "age", "skills", "work_history"] + } + + +@pytest.fixture +def sample_complex_json_schema(): + return { + "type": "object", + "properties": { + "score": { + "type": "integer", + "minimum": 0, + "maximum": 100 # Numeric range + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$" # Regex pattern + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + }, + "tags": { + "type": "array", + "items": { + "type": "string", + "pattern": + "^[a-z]{1,10}$" # Combining length and pattern restrictions + } + } + }, + "required": ["score", "grade", "email", "tags"] + } + + +@pytest.fixture +def sample_definition_json_schema(): + return { + '$defs': { + 'Step': { + 'properties': { + 'explanation': { + 'title': 'Explanation', + 'type': 'string' + }, + 'output': { + 'title': 'Output', + 'type': 'string' + } + }, + 'required': ['explanation', 'output'], + 'title': 'Step', + 'type': 'object' + } + }, + 'properties': { + 'steps': { + 'items': { + '$ref': '#/$defs/Step' + }, + 'title': 'Steps', + 'type': 'array' + }, + 'final_answer': { + 'title': 'Final Answer', + 'type': 'string' + } + }, + 'required': ['steps', 'final_answer'], + 'title': 'MathReasoning', + 'type': 'object' + } + + +@pytest.fixture +def sample_guided_choice(): + return [ + "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", + "Ruby", "Swift", "Kotlin" + ] + + +@pytest.fixture +def sample_sql_statements(): + return (""" +start: select_statement +select_statement: "SELECT" column "from" table "where" condition +column: "col_1" | "col_2" +table: "table_1" | "table_2" +condition: column "=" 
number +number: "1" | "2" +""") diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py new file mode 100644 index 00000000000..ef46a16ef34 --- /dev/null +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -0,0 +1,475 @@ +# SPDX-License-Identifier: Apache-2.0 + +import re +from typing import Dict, List, Optional + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio +from openai import BadRequestError + +from tests.utils import RemoteOpenAIServer +from vllm.transformers_utils.tokenizer import get_tokenizer + +# any model with a chat template should work here +MODEL_NAME = "facebook/opt-125m" + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + "--enforce-eager" + ] + + +@pytest.fixture(scope="module", + params=[["--no-enable-prefix-caching"], + [ + "--no-enable-prefix-caching", + "--disable-frontend-multiprocessing" + ]]) +def server(default_server_args, request): + if request.param: + default_server_args.extend(request.param) + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_single_completion(client: openai.AsyncOpenAI, + model_name: str) -> None: + completion = await client.completions.create(model=model_name, + prompt="Hello, my name is", + max_tokens=5, + temperature=0.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" + assert completion.usage == openai.types.CompletionUsage( + completion_tokens=5, prompt_tokens=6, total_tokens=11) + + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 1 + assert completion.choices[0].prompt_logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_no_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=None, + ) + choice = completion.choices[0] + assert choice.logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_zero_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=0, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert len(choice.logprobs.top_logprobs[0]) == 1 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_some_logprobs(client: openai.AsyncOpenAI, model_name: str): + # test using token IDs + completion = await 
client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=5, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6 + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI, + model_name: str) -> None: + + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + # vLLM has higher default max_logprobs (20 instead of 5) to support + # both Completion API and Chat Completion API + logprobs=21, + ) + ... + with pytest.raises( + (openai.BadRequestError, openai.APIError)): # test using token IDs + stream = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + # vLLM has higher default max_logprobs (20 instead of 5) to support + # both Completion API and Chat Completion API + logprobs=30, + stream=True, + ) + async for chunk in stream: + ... + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + assert len(completion.choices[0].text) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name, prompt_logprobs", [(MODEL_NAME, -1), + (MODEL_NAME, 0), + (MODEL_NAME, 1), + (MODEL_NAME, None)]) +async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI, + model_name: str, + prompt_logprobs: Optional[int]): + params: Dict = { + "prompt": ["A robot may not injure another robot", "My name is"], + "model": model_name, + } + if prompt_logprobs is not None: + params["extra_body"] = {"prompt_logprobs": prompt_logprobs} + + if prompt_logprobs is not None and prompt_logprobs < 0: + with pytest.raises(BadRequestError): + await client.completions.create(**params) + else: + completion = await client.completions.create(**params) + if prompt_logprobs is not None: + assert completion.choices[0].prompt_logprobs is not None + assert len(completion.choices[0].prompt_logprobs) > 0 + + assert completion.choices[1].prompt_logprobs is not None + assert len(completion.choices[1].prompt_logprobs) > 0 + + else: + assert completion.choices[0].prompt_logprobs is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_completion_streaming(client: openai.AsyncOpenAI, + model_name: str) -> None: + prompt = "What is an LLM?" 
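The tests above exercise three request shapes: plain top-k sample logprobs, prompt logprobs passed through extra_body, and an over-limit request that the server rejects. A minimal sketch of those shapes with the official openai client follows; it assumes a vLLM OpenAI-compatible server is already running locally, and the base URL, API key and model name below are illustrative placeholders rather than values defined in this patch.

import openai

# Placeholder endpoint for a locally running vLLM OpenAI-compatible server.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Sample logprobs: the top-5 alternatives are returned for each generated token.
completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="Hello, my name is",
    max_tokens=5,
    temperature=0.0,
    logprobs=5,
)
print(completion.choices[0].logprobs.top_logprobs)

# Prompt logprobs are a vLLM extension and travel in extra_body.
completion = client.completions.create(
    model="facebook/opt-125m",
    prompt="Hello, my name is",
    max_tokens=5,
    temperature=0.0,
    extra_body={"prompt_logprobs": 1},
)
print(completion.choices[0].prompt_logprobs)

# Anything above the server-side max_logprobs limit is rejected up front.
try:
    client.completions.create(model="facebook/opt-125m",
                              prompt="Hello, my name is",
                              max_tokens=5,
                              logprobs=21)
except (openai.BadRequestError, openai.APIError) as exc:
    print("rejected:", exc)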
+ + single_completion = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + ) + single_output = single_completion.choices[0].text + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + chunks.append(chunk.choices[0].text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == "length" + assert chunk.choices[0].text + assert "".join(chunks) == single_output + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_completion_stream_options(client: openai.AsyncOpenAI, + model_name: str): + prompt = "What is the capital of France?" + + # Test stream=True, stream_options= + # {"include_usage": False, "continuous_usage_stats": False} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": + False, + }) + + async for chunk in stream: + assert chunk.usage is None + + # Test stream=True, stream_options= + # {"include_usage": False, "continuous_usage_stats": True} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": False, + "continuous_usage_stats": + True, + }) + async for chunk in stream: + assert chunk.usage is None + + # Test stream=True, stream_options= + # {"include_usage": True, "continuous_usage_stats": False} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": + False, + }) + async for chunk in stream: + if chunk.choices[0].finish_reason is None: + assert chunk.usage is None + else: + assert chunk.usage is None + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) + assert final_chunk.choices == [] + + # Test stream=True, stream_options= + # {"include_usage": True, "continuous_usage_stats": True} + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": + True, + }) + async for chunk in stream: + assert chunk.usage is not None + assert chunk.usage.prompt_tokens > 0 + assert chunk.usage.completion_tokens > 0 + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + + chunk.usage.completion_tokens) + if chunk.choices[0].finish_reason is not None: + final_chunk = await stream.__anext__() + assert final_chunk.usage is not None + assert final_chunk.usage.prompt_tokens > 0 + assert final_chunk.usage.completion_tokens > 0 + assert final_chunk.usage.total_tokens == ( + final_chunk.usage.prompt_tokens + + final_chunk.usage.completion_tokens) + assert final_chunk.choices == [] + + # Test stream=False, stream_options= + # {"include_usage": None} + with pytest.raises(BadRequestError): + await 
client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": None}) + + # Test stream=False, stream_options= + # {"include_usage": True} + with pytest.raises(BadRequestError): + await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"include_usage": True}) + + # Test stream=False, stream_options= + # {"continuous_usage_stats": None} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"continuous_usage_stats": None}) + + # Test stream=False, stream_options= + # {"continuous_usage_stats": True} + with pytest.raises(BadRequestError): + await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=False, + stream_options={"continuous_usage_stats": True}) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_batch_completions(client: openai.AsyncOpenAI, model_name: str): + # test both text and token IDs + for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text + + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=prompts, + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but + # not necessary for official client. + use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +@pytest.mark.parametrize("logprobs_arg", [1, 0]) +async def test_echo_logprob_completion(client: openai.AsyncOpenAI, + model_name: str, logprobs_arg: int): + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + # test using text and token IDs + for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]): + completion = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + echo=True, + logprobs=logprobs_arg) + + prompt_text = tokenizer.decode(prompt) if isinstance(prompt, + list) else prompt + assert re.search(r"^" + prompt_text, completion.choices[0].text) + logprobs = completion.choices[0].logprobs + assert logprobs is not None + assert len(logprobs.text_offset) > 5 + assert (len(logprobs.token_logprobs) > 5 + and logprobs.token_logprobs[0] is None) + assert (len(logprobs.top_logprobs) > 5 + and logprobs.top_logprobs[0] is None) + for top_logprobs in logprobs.top_logprobs[1:]: + assert max(logprobs_arg, + 1) <= len(top_logprobs) <= 
logprobs_arg + 1 + assert len(logprobs.tokens) > 5 diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py new file mode 100644 index 00000000000..86c576cd70a --- /dev/null +++ b/tests/v1/sample/test_logprobs.py @@ -0,0 +1,392 @@ +# SPDX-License-Identifier: Apache-2.0 + +import itertools +from typing import List, Tuple + +import pytest +import torch + +from tests.kernels.utils import override_backend_env_variable +from tests.v1.sample.utils import ( + assert_incr_detok_str_matches_non_incr_detok_str, + compute_correct_cumulative_logprob, get_test_batch) +from vllm import SamplingParams + +from ...conftest import VllmRunner + +MODEL = "meta-llama/Llama-3.2-1B" +DTYPE = "half" + + +@pytest.fixture(scope="module") +def vllm_model(vllm_runner): + with vllm_runner( + MODEL, + dtype=DTYPE, + max_logprobs=7, + # Very small number of batched tokens to ensure + # that we test chunking. + max_num_batched_tokens=16, + max_num_seqs=16, + max_model_len=128, + enforce_eager=True, + #TODO: enable this once we support it for + # prompt logprobs. + enable_prefix_caching=False, + gpu_memory_utilization=0.5, + ) as vllm_model: + yield vllm_model + + +@pytest.fixture(scope="module") +def hf_model(hf_runner): + with hf_runner(MODEL, dtype=DTYPE) as hf_model: + yield hf_model + + +def _repeat_logprob_config( + test_prompts, + logprob_prompt_logprob_list: List[Tuple], +) -> List[Tuple]: + """Ensure each test prompt has a logprob config. + + A logprob config specifies the optional (i.e. + may-be-`None`) number of sample logprobs and + the optional number of prompt logprobs. + + If more test prompts than logprob configs are + provided, the provided logprob configs are + tiled to match the number of test prompts. + + If fewer test prompts than logprob configs + are provided, the list of logprob configs + is truncated to match the number of test + prompts. + + Otherwise, the list of logprob configs + is returned as-is. 
+ + Args: + test_prompts: list of prompts under test + logprob_prompt_logprob_list: list of + (optional num sample logprob, + optional num prompt logprob) + tuples + + Returns: + List of + (optional num sample logprob,optional num prompt logprob) + tuples which is either identical to + `logprob_prompt_logprob_list`, or else repeats + `logprob_prompt_logprob_list` enough times to match the + number of `test_prompts`, or else is truncated to match + the number of `test_prompts` + """ + num_test_prompts = len(test_prompts) + # Make sure there is a logprobs configuration for each test prompt + logprob_prompt_logprob_list = list( + itertools.islice(itertools.cycle(logprob_prompt_logprob_list), + num_test_prompts)) + # Now the number of prompts should match the number of sample params combos + assert num_test_prompts == len(logprob_prompt_logprob_list) + return logprob_prompt_logprob_list + + +def _test_case_get_logprobs_and_prompt_logprobs( + hf_model, + vllm_model, + batch_logprobs_composition: str, + temperature: float, + example_prompts, +) -> None: + test_prompts = example_prompts + + max_tokens = 5 + hf_outputs = hf_model.generate_greedy( + test_prompts, + max_tokens=max_tokens, + ) + hf_logprobs = hf_model.generate_greedy_logprobs( + test_prompts, + max_tokens=max_tokens, + ) + + # Batch has mixed sample params + # (different logprobs/prompt logprobs combos) + logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition) + + # Ensure that each test prompt has a logprob config for testing + logprob_prompt_logprob_list = _repeat_logprob_config( + test_prompts, logprob_prompt_logprob_list) + # Generate SamplingParams + vllm_sampling_params = [ + SamplingParams(max_tokens=max_tokens, + logprobs=num_lp, + prompt_logprobs=num_plp, + temperature=temperature, + seed=1984) + for num_lp, num_plp in logprob_prompt_logprob_list + ] + + vllm_results = vllm_model.model.generate( + test_prompts, sampling_params=vllm_sampling_params) + + for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( + vllm_results, hf_logprobs, hf_outputs, + logprob_prompt_logprob_list): + + # Extract request-level (prompt)logprobs config + num_top_logprobs, num_top_prompt_logprobs = logprob_prompt_logprob + + # Test whether sampled token output is consistent between vLLM and HF + # vLLM prompt+completion should match HF output + if temperature == 0.0: + assert (vllm_result.prompt_token_ids + + vllm_result.outputs[0].token_ids == hf_output[0]) + else: + # Sampled tokens won't match if not greedy + assert (vllm_result.prompt_token_ids == hf_output[0] + [:len(vllm_result.prompt_token_ids)]) + + # Validate sample logprobs + if num_top_logprobs is not None: + assert num_top_logprobs is not None + # Confirm that the structure of the sample logprobs in the result is + # correct + assert vllm_result.outputs[0].logprobs is not None + assert len(vllm_result.outputs[0].logprobs) == max_tokens + for logprobs, token_id in zip(vllm_result.outputs[0].logprobs, + vllm_result.outputs[0].token_ids): + assert logprobs is not None + + # Confirm that the output token appears among the logprobs + assert token_id in logprobs + token_in_topk = logprobs[token_id].rank <= num_top_logprobs + + # If the output token is not included in the top K + # logprob, it can return 1 more data + if token_in_topk and num_top_logprobs != 0: + assert len(logprobs) == num_top_logprobs + else: + assert len(logprobs) == num_top_logprobs + 1 + + if num_top_logprobs > 0: + # We should have an entry for each of the topk ranks + all_ranks = {lp.rank 
for lp in logprobs.values()} + assert all(r in all_ranks + for r in range(1, num_top_logprobs + 1)) + + output_text = vllm_result.outputs[0].text + output_string_from_most_likely_tokens_lst: List[str] = [] + for top_logprobs in vllm_result.outputs[0].logprobs: + top_logprob = next(iter(top_logprobs.values())) + output_string_from_most_likely_tokens_lst.append( + top_logprob.decoded_token) + + output_string_from_most_likely_tokens = "".join( + output_string_from_most_likely_tokens_lst) + assert_incr_detok_str_matches_non_incr_detok_str( + output_text, output_string_from_most_likely_tokens, + "The output text from the top logprob for each token " + "position should be the same as the output text in the " + "result.") + + # Compare vLLM sample logprobs to HF + vllm_sample_logprobs = vllm_result.outputs[0].logprobs + for i, top_logprobs in enumerate(vllm_sample_logprobs): + for token_id, sample_logprob in top_logprobs.items(): + if temperature == 0.0 or i == 0: + logprob = sample_logprob.logprob + torch.testing.assert_close( + logprob, + hf_logprob[i][-1][token_id].item(), + atol=1e-2, + rtol=1e-2) + assert isinstance( + sample_logprob.decoded_token, + str), ("The token should be decoded by the time it is" + " returned to the user.") + + # At this point we know the sample logprobs are correct for this + # request. Validate that cumulative_logprob is actually the sum. + # For each request, assert that the returned cumulative logprob + # matches the correct value, which is computed below. + torch.testing.assert_close( + vllm_result.outputs[0].cumulative_logprob, + compute_correct_cumulative_logprob(vllm_result.outputs[0]), + atol=1e-6, + rtol=1e-6) + else: + # Logprobs disabled for this request; should be None + assert vllm_result.outputs[0].logprobs is None + + # Validate prompt logprobs + if num_top_prompt_logprobs is not None: + # Confirm that structure of prompt logprobs in result is correct + assert vllm_result.prompt_logprobs is not None + # - The first prompt logprob is always None + assert vllm_result.prompt_logprobs[0] is None + # - Prompt logprobs are returned for all indices in + # the prompt + assert len(vllm_result.prompt_logprobs) == len( + vllm_result.prompt_token_ids) + for prompt_logprobs, prompt_token_id in zip( + vllm_result.prompt_logprobs[1:], + vllm_result.prompt_token_ids[1:]): + assert prompt_logprobs is not None + + # Confirm that the prompt token appears among the logprobs + assert prompt_token_id in prompt_logprobs + token_in_topk = prompt_logprobs[ + prompt_token_id].rank <= num_top_prompt_logprobs + + # If the prompt token is not included in the top K + # logprob, it can return 1 more data + if token_in_topk and num_top_prompt_logprobs != 0: + assert len(prompt_logprobs) == num_top_prompt_logprobs + else: + assert len(prompt_logprobs) == num_top_prompt_logprobs + 1 + + if num_top_prompt_logprobs > 0: + # We should have an entry for each of the topk ranks + all_ranks = {lp.rank for lp in prompt_logprobs.values()} + assert all(r in all_ranks + for r in range(1, num_top_prompt_logprobs + 1)) + + # Compare prompt logprobs to HF + # The first prompt logprob is always None, so we compare it from + # 1:. 
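To make the per-position checks above concrete, here is a small self-contained sketch; SimpleLogprob is an illustrative stand-in for vllm.sequence.Logprob and the token ids, logprobs and ranks are arbitrary.

from collections import namedtuple

# Stand-in for vllm.sequence.Logprob; only the fields used here.
SimpleLogprob = namedtuple("SimpleLogprob", ["logprob", "rank"])

def check_position(logprobs_dict, token_id, num_top_logprobs):
    """Mirror the size/rank checks performed for each token position."""
    assert token_id in logprobs_dict
    token_in_topk = logprobs_dict[token_id].rank <= num_top_logprobs
    if token_in_topk and num_top_logprobs != 0:
        assert len(logprobs_dict) == num_top_logprobs
    else:
        # The sampled (or prompt) token fell outside the top-k, so one
        # extra entry is returned on top of the requested k.
        assert len(logprobs_dict) == num_top_logprobs + 1
    if num_top_logprobs > 0:
        all_ranks = {lp.rank for lp in logprobs_dict.values()}
        assert all(r in all_ranks for r in range(1, num_top_logprobs + 1))

# Sampled token (id 7) ranked 5th: top-2 requested, so 3 entries come back.
check_position(
    {
        3: SimpleLogprob(-0.1, 1),
        9: SimpleLogprob(-0.7, 2),
        7: SimpleLogprob(-2.3, 5),
    },
    token_id=7,
    num_top_logprobs=2,
)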
+ vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] + for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): + for token_id, logprob in vllm_prompt_logprob_dict.items(): + torch.testing.assert_close( + logprob.logprob, + hf_logprob[0][i][token_id].item(), + atol=2e-2, + rtol=2e-2) + else: + assert vllm_result.prompt_logprobs is None + + +#@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize("batch_logprobs_composition", + ["NONE", "SAMPLE", "PROMPT", "SAMPLE_PROMPT"]) +@pytest.mark.parametrize("temperature", [0.0, 2.0]) +def test_get_logprobs_and_prompt_logprobs( + hf_model, + vllm_model, + batch_logprobs_composition: str, + temperature: float, + example_prompts, +) -> None: + """Test V1 Engine logprobs & prompt logprobs + + Exercise a variety of combinations of `logprobs` and `prompt_logprobs` + settings and validate that + * The generated logprobs and prompt logprobs are consistent with the + configuration settings, in terms of whether or not the logprobs + (of either type) were requested and how many were requested + * The generated logprobs are consistent with the generated tokens + * The generated (prompt)logprobs are consistent with HuggingFace + (prompt)logprobs, as a reference + + batch_logprobs_composition controls the logprobs configurations for + requests in the batch under test. + + Args: + hf_model + vllm_model + batch_logprobs_composition: logprobs configuration for test batch + example_prompts + monkeypatch + """ + _test_case_get_logprobs_and_prompt_logprobs( + hf_model=hf_model, + vllm_model=vllm_model, + batch_logprobs_composition=batch_logprobs_composition, + temperature=temperature, + example_prompts=example_prompts) + + +def test_max_logprobs(monkeypatch): + """vLLM v1 engine should fail a request with `logprobs > max_logprobs` + + Should also fail for `prompt_logprobs > max_logprobs` + + Args: + monkeypatch + """ + override_backend_env_variable(monkeypatch, "FLASH_ATTN") + + runner = VllmRunner("facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + max_model_len=256) + vllm_sampling_params = SamplingParams(logprobs=1) + # should pass + runner.generate(["Hello world"], sampling_params=vllm_sampling_params) + + bad_sampling_params = SamplingParams(logprobs=2) + with pytest.raises(ValueError): + runner.generate(["Hello world"], sampling_params=bad_sampling_params) + + +def test_none_logprobs(vllm_model, example_prompts, monkeypatch): + """Engine should return `logprobs` and `prompt_logprobs` as `None` + + Args: + vllm_model: vLLM model fixture + example_prompts: list of example prompts (test fixture) + monkeypatch: supports editing env vars and rolling back changes + after the test + """ + max_tokens = 5 + + sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens, + logprobs=None, + prompt_logprobs=None, + temperature=0.0) + results_logprobs_none = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_none) + + for i in range(len(results_logprobs_none)): + # Check sample logprobs are None + assert results_logprobs_none[i].outputs[0].logprobs is None + assert results_logprobs_none[i].outputs[0].cumulative_logprob is None + # Check prompt logprobs are None + assert results_logprobs_none[i].prompt_logprobs is None + + +def test_zero_logprobs(vllm_model, example_prompts, monkeypatch): + """Engine should return sampled token and prompt token logprobs + + Args: + vllm_model: vLLM model fixture + example_prompts: list of example prompts (test fixture) + monkeypatch: supports editing env vars 
and rolling back changes + after the test + """ + max_tokens = 5 + + sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens, + logprobs=0, + prompt_logprobs=0, + temperature=0.0) + results_logprobs_zero = vllm_model.model.generate( + example_prompts, sampling_params=sampling_params_logprobs_zero) + + for i in range(len(results_logprobs_zero)): + # Check that there is one sample logprob dict for each + # sample token + logprobs = results_logprobs_zero[i].outputs[0].logprobs + prompt_logprobs = results_logprobs_zero[i].prompt_logprobs + sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids + prompt_token_ids = results_logprobs_zero[i].prompt_token_ids + assert logprobs is not None + assert len(sampled_token_ids) == len(logprobs) + assert results_logprobs_zero[i].outputs[ + 0].cumulative_logprob is not None + # Check that there is one prompt logprob dict for each + # prompt token + assert prompt_logprobs is not None + assert len(prompt_token_ids) == len(prompt_logprobs) diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py new file mode 100644 index 00000000000..28c177fd497 --- /dev/null +++ b/tests/v1/sample/test_logprobs_e2e.py @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 + +import lm_eval + +from ...utils import RemoteOpenAIServer + +# arc-easy uses prompt_logprobs=1, logprobs=1 +TASK = "arc_easy" +FILTER = "acc_norm,none" +RTOL = 0.03 +EXPECTED_VALUE = 0.62 + +# FIXME(rob): enable prefix caching once supported. +MODEL = "meta-llama/Llama-3.2-1B" +MODEL_ARGS = f"pretrained={MODEL},enforce_eager=True,enable_prefix_caching=False" # noqa: E501 +SERVER_ARGS = [ + "--enforce_eager", "--no_enable_prefix_caching", "--disable-log-requests" +] +NUM_CONCURRENT = 100 + + +def test_prompt_logprobs_e2e(): + results = lm_eval.simple_evaluate(model="vllm", + model_args=MODEL_ARGS, + tasks=TASK, + batch_size="auto") + + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +def test_promt_logprobs_e2e_server(): + with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server: + url = f"{remote_server.url_for('v1')}/completions" + + model_args = ( + f"model={MODEL}," + f"base_url={url}," + f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False") + + results = lm_eval.simple_evaluate( + model="local-completions", + model_args=model_args, + tasks=TASK, + ) + + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py new file mode 100644 index 00000000000..e1465b12396 --- /dev/null +++ b/tests/v1/sample/utils.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 + +import re +from typing import List, Tuple + +from vllm import CompletionOutput + + +def get_test_batch(batch_logprobs_composition: str) -> List[Tuple]: + """Generate logprobs configs for a batch of requests + + A given request's logprobs configuration is (1) num_sample_logprobs and (2) + num_prompt_logprobs. The batch logprobs configuration is the list of request + logprobs configs. 
+ + batch_logprobs_composition == "NONE" yields a batch with no sample or prompt + logprobs + + batch_logprobs_composition == "SAMPLE" yields a batch with some requests + configured for sample logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "PROMPT" yields a batch with some requests + configured for prompt logprobs only, and others configured for no logprobs + + batch_logprobs_composition == "SAMPLE_PROMPT" yields a batch with some + requests configured for sample logprobs and prompt logprobs, some configured + for only sample logprobs or only prompt logprobs, and some configured for + no logprobs + + Args: + batch_logprobs_composition: types of logprobs configs to include in batch + + Returns: + + List of (Optional[num_sample_logprobs], Optional[num_prompt_logprobs]) + tuples + """ + if batch_logprobs_composition == "NONE": + # No requests with sample or prompt logprobs + return [(None, None)] + elif batch_logprobs_composition == "SAMPLE": + # Requests requiring sample logprobs or no logprobs + return [ + (None, None), + (0, None), + (5, None), + (3, None), + ] + elif batch_logprobs_composition == "PROMPT": + # Requests requiring prompt logprobs or no logprobs + return [ + (None, None), + (None, 0), + (None, 6), + (None, 5), + ] + elif batch_logprobs_composition == "SAMPLE_PROMPT": + # Requests requiring either no logprobs, just + # sample logprobs, just prompt logprobs, or + # both sample and prompt logprobs + return [ + (None, None), + (0, None), + (5, None), + (3, None), + (0, 3), + (6, 0), + (6, 3), + (None, 6), + (None, 5), + (None, 0), + ] + else: + raise ValueError("Invalid logprobs batch configuration for test.") + + +def assert_incr_detok_str_matches_non_incr_detok_str( + incremental_detokenization_str: str, + non_incremental_detokenization_str: str, + msg: str, +) -> None: + """Compare incrementally detok. text to non-incrementally detok. text + + Fail if the strings mismatch after non-alphanumeric characters are stripped + out. + + Rationale: incremental detokenization in the text generation process allows + the tokenizer to adjust the next token text output based on the token's + context in the string. However, logprobs detokenization detokenizes each + token individually, and the resultant strings may include some + non-alphanumeric placeholder characters where there could be i.e. + whitespace. So, this function compares only the alphanumeric text + between two strings and fails if there is a mismatch, which helps + with validating logprobs detokenization. 
+ + Args: + incremental_detokenization_str: incrementally-detokenized generated text + non_incremental_detokenization_str: non-incrementally-detokenized logprob + tokens + msg: error message if `assert` fails + """ + rgx = r'[^a-zA-Z0-9]+' + assert (re.sub(rgx, '', incremental_detokenization_str) == re.sub( + rgx, '', non_incremental_detokenization_str)), (msg) + + +def compute_correct_cumulative_logprob( + completion_output: CompletionOutput) -> float: + """Compute known-good value for evaluating cumulative logprob + + Args: + completion_output: completion output from engine + + Returns: + Known-good cumulative logprob value + """ + token_ids = completion_output.token_ids + logprobs = completion_output.logprobs + assert logprobs is not None + return sum([lp[tok_id].logprob for tok_id, lp in zip(token_ids, logprobs)]) diff --git a/vllm/outputs.py b/vllm/outputs.py index 786380c37f6..030119710a1 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -142,6 +142,9 @@ def new( prompt_token_ids: Optional[List[int]], text: str, token_ids: List[int], + logprobs: Optional[SampleLogprobs], + prompt_logprobs: Optional[PromptLogprobs], + cumulative_logprob: Optional[float], finished: bool = False, ) -> "RequestOutput": """Initialize a new RequestOutput object.""" @@ -151,15 +154,14 @@ def new( index=0, text=text, token_ids=token_ids, - cumulative_logprob=None, - logprobs=None, # TODO - ) + cumulative_logprob=cumulative_logprob, + logprobs=logprobs) return RequestOutput( request_id=request_id, prompt=prompt, prompt_token_ids=prompt_token_ids, - prompt_logprobs=None, # TODO + prompt_logprobs=prompt_logprobs, outputs=[completion_output], finished=finished, ) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 8160a35ff22..a1fa27773fe 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -74,6 +74,25 @@ def convert_prompt_ids_to_tokens( return new_tokens, prefix_offset, read_offset +def convert_ids_list_to_tokens( + tokenizer: AnyTokenizer, + token_ids: List[int], +) -> List[str]: + """Detokenize the input ids individually. + + Args: + tokenizer: tokenizer used by model under test + token_ids: convert these tokens (Python list form) + + Returns: + Python list of token string representations + + """ + token_str_lst = tokenizer.convert_ids_to_tokens(token_ids) + _replace_none_with_empty(token_str_lst) # type: ignore + return token_str_lst + + # Based on # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 # under Apache 2.0 license diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 6c44fec6439..35d9424f942 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -437,6 +437,8 @@ def update_from_output( ) -> EngineCoreOutputs: # NOTE(woosuk): This method doesn't consider speculative decoding. sampled_token_ids = model_runner_output.sampled_token_ids + logprobs = model_runner_output.logprobs + prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] outputs: List[EngineCoreOutput] = [] @@ -471,6 +473,13 @@ def update_from_output( self.encoder_cache_manager.free_encoder_input( request, input_id) + # Get prompt logprobs for this request. 
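Looping back to the two helpers added in tests/v1/sample/utils.py above, here is a tiny self-contained sketch of what each one computes; SimpleLogprob is an illustrative stand-in for vllm.sequence.Logprob and the strings and numbers are made up.

import re
from collections import namedtuple

# Stand-in for vllm.sequence.Logprob; only the field used here.
SimpleLogprob = namedtuple("SimpleLogprob", ["logprob"])

# compute_correct_cumulative_logprob: sum the sampled token's logprob at
# each generated position.
token_ids = [11, 42]
logprobs = [
    {11: SimpleLogprob(-0.25), 7: SimpleLogprob(-1.5)},
    {42: SimpleLogprob(-0.50), 3: SimpleLogprob(-2.0)},
]
cumulative = sum(lp[tok].logprob for tok, lp in zip(token_ids, logprobs))
assert abs(cumulative - (-0.75)) < 1e-9

# assert_incr_detok_str_matches_non_incr_detok_str: compare only the
# alphanumeric characters, since per-token detokenization may differ from
# incremental detokenization in whitespace and placeholder characters.
rgx = r"[^a-zA-Z0-9]+"
incremental = "Hello, world!"
per_token = "Hello ,  world !"
assert re.sub(rgx, "", incremental) == re.sub(rgx, "", per_token)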
+ prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id) + + stopped = False + new_logprobs = None + new_token_ids = None + if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request @@ -486,20 +495,30 @@ def update_from_output( if stopped: self._free_request(request) + # Extract sample logprobs if needed. + if request.sampling_params.logprobs is not None: + assert logprobs is not None + # NOTE: once we support N tokens per step (spec decode), + # the outer lists can be of length > 1. + new_logprobs = logprobs.slice(req_index, req_index + 1) + + new_token_ids = request.output_token_ids[-num_new_tokens:] + + # Transmit partial if chunked prefill & prompt logprobs is enabled + if new_token_ids or prompt_logprobs_tensors is not None: # Add EngineCoreOutput for this Request. - output = EngineCoreOutput( - request_id=req_id, - new_token_ids=request.output_token_ids[-num_new_tokens:], - finished=request.is_finished(), - finish_reason=request.get_finished_reason(), - stop_reason=request.stop_reason) - outputs.append(output) - - # Breakout of the loop. - if stopped: - continue + outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=new_token_ids or [], + finish_reason=request.get_finished_reason(), + new_logprobs=new_logprobs, + new_prompt_logprobs_tensors=prompt_logprobs_tensors, + stop_reason=request.stop_reason)) + + if not stopped: + new_running.append(request) - new_running.append(request) self.running = new_running return EngineCoreOutputs( outputs=outputs, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index d5933cac50c..b05ef3cc8c7 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -7,6 +7,7 @@ import msgspec from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.outputs import LogprobsLists, LogprobsTensors if TYPE_CHECKING: from vllm.lora.request import LoRARequest @@ -67,10 +68,17 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] - finished: bool + + new_logprobs: Optional[LogprobsLists] = None + new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None + @property + def finished(self) -> bool: + return self.finish_reason is not None + class EngineCoreOutputs( msgspec.Struct, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 29a9ac1868f..f3d40aa1e9c 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -11,7 +11,6 @@ import psutil import zmq import zmq.asyncio -from msgspec import msgpack from vllm.config import VllmConfig from vllm.logger import init_logger @@ -26,7 +25,7 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.serial_utils import MsgpackEncoder, PickleEncoder from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -292,7 +291,7 @@ def process_output_socket(self, output_path: str): """Output socket IO thread.""" # Msgpack serialization encoding. - encoder = msgpack.Encoder() + encoder = MsgpackEncoder() # Reuse send buffer. 
buffer = bytearray() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 247380ef7cf..cdc63acdb74 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -7,7 +7,6 @@ from abc import ABC, abstractmethod from typing import List, Optional, Type -import msgspec import zmq import zmq.asyncio @@ -20,7 +19,7 @@ EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.executor.abstract import Executor -from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.serial_utils import MsgpackDecoder, PickleEncoder from vllm.v1.utils import BackgroundProcHandle logger = init_logger(__name__) @@ -163,7 +162,7 @@ def sigusr1_handler(signum, frame): # Serialization setup. self.encoder = PickleEncoder() - self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) + self.decoder = MsgpackDecoder(EngineCoreOutputs) # ZMQ setup. self.ctx = ( diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 861fcb012c3..629da06f492 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,27 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import List, Optional, Union +from typing import List, Optional from vllm.engine.output_processor.stop_checker import StopChecker from vllm.logger import init_logger -from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason +from vllm.v1.engine import EngineCoreRequest logger = init_logger(__name__) -@dataclass -class DetokenizerOutput: - output_text: str - token_ids: List[int] - finished: bool - finish_reason: Optional[FinishReason] = None - stop_reason: Union[int, str, None] = None - - @dataclass class IncrementalDetokenizer: @@ -42,7 +32,6 @@ class IncrementalDetokenizer: # Parameters for detokenization skip_special_tokens: bool spaces_between_special_tokens: bool - output_kind: RequestOutputKind # Tokenizer for this request tokenizer: AnyTokenizer @@ -90,25 +79,19 @@ def from_new_request( skip_special_tokens=request.sampling_params.skip_special_tokens, spaces_between_special_tokens=request.sampling_params. spaces_between_special_tokens, - output_kind=request.sampling_params.output_kind, prompt_len=len(request.prompt_token_ids), tokenizer=tokenizer, stop_buffer_length=stop_buffer_length, ) - def update_from_output( - self, - output: EngineCoreOutput, - ) -> Optional[DetokenizerOutput]: + def update(self, new_token_ids: List[int]) -> Optional[str]: """ Update RequestState for the request_id by: 1) Detokenize the new token ids incrementally. - 2) Update the RequestOutput with the new text. - """ + 2) Evaluate stop criteria. - new_token_ids = output.new_token_ids - finish_reason = output.finish_reason - stop_reason = output.stop_reason + Return matched stop string or None. + """ # 1) Detokenize the new token ids incrementally. # TODO(woosuk): This method becomes very inefficient when the number of @@ -131,11 +114,13 @@ def update_from_output( self.tokens.extend(new_tokens) self.prefix_offset = prefix_offset self.read_offset = read_offset - self.output_text += new_decoded_token_text decoded_text += new_decoded_token_text + self.output_text += decoded_text + # 2) Evaluate stop criteria. 
+ stop_string = None if self.stop: stop = StopChecker.check_stop_strings( output_text=self.output_text, @@ -144,28 +129,13 @@ def update_from_output( include_in_output=self.include_stop_str_in_output, ) if stop is not None: - stop_str, truncate_to = stop + stop_string, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] - finish_reason = FinishReason.STOP - stop_reason = stop_str - - # TODO: handle stop_token_ids here too? - - # 3) Update the RequestOutput object with the new text. - finished = finish_reason is not None - if self.output_kind == RequestOutputKind.FINAL_ONLY \ - and not finished: - return None - - delta = self.output_kind == RequestOutputKind.DELTA - output_text = self._get_next_output_text(finished, delta) - token_ids = new_token_ids if delta else self.output_token_ids - return DetokenizerOutput(output_text, token_ids, finished, - finish_reason, stop_reason) + return stop_string - def _get_next_output_text(self, finished: bool, delta: bool) -> str: + def get_next_output_text(self, finished: bool, delta: bool) -> str: """If delta is True, only new text since the last call to this method is returned""" diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index e0452bcad7b..3ef5a970606 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -45,6 +45,7 @@ def __init__( multiprocess_mode: bool = False, ) -> None: self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py new file mode 100644 index 00000000000..4622cafa4a0 --- /dev/null +++ b/vllm/v1/engine/logprobs.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: Apache-2.0 + +import itertools +from dataclasses import dataclass +from typing import Dict, List, Optional + +from vllm.logger import init_logger +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_ids_list_to_tokens) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.outputs import LogprobsLists, LogprobsTensors + +logger = init_logger(__name__) + + +@dataclass +class LogprobsProcessor: + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Logprobs for this request + logprobs: Optional[SampleLogprobs] + prompt_logprobs: Optional[PromptLogprobs] + cumulative_logprob: Optional[float] + num_logprobs: Optional[int] + num_prompt_logprobs: Optional[int] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: EngineCoreRequest, + ) -> "LogprobsProcessor": + num_logprobs = request.sampling_params.logprobs + num_prompt_logprobs = request.sampling_params.prompt_logprobs + return cls( + tokenizer=tokenizer, + cumulative_logprob=(None if num_logprobs is None else 0.), + logprobs=(None if num_logprobs is None else []), + # NOTE: logprob of first prompt token is None. + prompt_logprobs=(None if num_prompt_logprobs is None else [None]), + num_prompt_logprobs=num_prompt_logprobs, + num_logprobs=num_logprobs, + ) + + def _update_sample_logprobs(self, logprobs_lists: LogprobsLists) -> None: + """Update with sample logprobs from EngineCore. + + Outer lists are only of len > 1 if EngineCore made + >1 tokens in prior step (e.g. in spec decoding). + + Args: + logprobs_lists: the lists of logprob tokens, logprobs, and ranks. 
+ + """ + + assert self.num_logprobs is not None + assert self.logprobs is not None + assert self.cumulative_logprob is not None + + token_ids_lst, logprobs_lst, ranks_lst = logprobs_lists + + for rank, logprobs, token_ids in zip(ranks_lst, logprobs_lst, + token_ids_lst): + + # Detokenize (non-incrementally). + decoded_tokens = convert_ids_list_to_tokens( + self.tokenizer, token_ids) + + # Sampler puts the sampled logprob in first. + sampled_token_logprob = logprobs[0] + self.cumulative_logprob += sampled_token_logprob + + # Update with the Logprob dictionary for this pos. + self.logprobs.append( + self._make_logprob_dict( + logprobs, + token_ids, + decoded_tokens, + rank, + self.num_logprobs, + )) + + def _update_prompt_logprobs( + self, + prompt_logprobs_tensors: LogprobsTensors, + ) -> None: + """Update with prompt logprobs from EngineCore. + + Args: + prompt_logprobs_tensors: tuple containing the prompt logprobs + tensors. + + """ + + # Prompt logprobs are enabled. + assert self.num_prompt_logprobs is not None + assert self.prompt_logprobs is not None + + token_ids, logprobs, ranks = prompt_logprobs_tensors + + # Detokenize non-incrementally. + # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps] + decoded_tokens = convert_ids_list_to_tokens( + self.tokenizer, + token_ids.flatten().tolist()) + + # Recover shapes. + num_prompt_tokens, num_logprobs = logprobs.shape + + # Pythonize the torch tensors. + # TODO(rob): experiment with doing this in EngineCore? + prompt_token_ranks = ranks.tolist() + prompt_logprobs = logprobs.tolist() + token_ids = token_ids.tolist() + + # Make Logprob for each position. + for pos in range(num_prompt_tokens): + # Handle flattening. + offset = pos * num_logprobs + offset_end = offset + num_logprobs + decoded_tokens_for_pos = decoded_tokens[offset:offset_end] + + # Update with the Logprob dictionary for this pos. + self.prompt_logprobs.append( + self._make_logprob_dict(prompt_logprobs[pos], token_ids[pos], + decoded_tokens_for_pos, + prompt_token_ranks[pos], + self.num_prompt_logprobs)) + + def pop_prompt_logprobs(self) -> Optional[PromptLogprobs]: + """Pop and return all request prompt logprobs + + The logprobs processor aggregates prompt chunk logprobs + over one or more prefill chunks. This method returns + all prompt logprobs at once and then forgets them. + Ensures correct RequestOutputKind.DELTA semantics + wherein all prompt logprobs are returned at once at + the end of prefill. + + Returns: + None if prompt logprobs are disabled for this request. + List of all prompt logprobs, otherwise. + """ + plp = self.prompt_logprobs + if plp: + self.prompt_logprobs = [] + return plp + + @staticmethod + def _make_logprob_dict( + logprobs: List[float], + logprob_token_ids: List[int], + decoded_tokens: List[str], + rank: int, + num_logprobs: int, + ) -> Dict[int, Logprob]: + """Make a Logprob dictionary for a position. + + Args: + logprobs: list of log probabilities + logprob_token_ids: list of top token ids + decoded_tokens: list of decoded top tokens + rank: rank of the sampled token + num_logprobs: number of logprobs requested + by the user (in addition to sampled logprob) + + Returns: + Dict[token id, Logprob] + """ + + # We do not need a special case for the sampled token + # being in the topk, since inserting duplicated data + # into a dictionary twice is the same as doing it once. 
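A quick standalone illustration of that dictionary property, together with the rank chaining used just below (the ids and logprob values here are arbitrary and only for demonstration):

import itertools

num_logprobs = 2
sampled_rank = 1                   # sampled token happens to be the top-1
logprob_token_ids = [7, 7, 3]      # sampled id first, then the top-k ids
logprobs = [-0.1, -0.1, -1.2]
ranks = itertools.chain((sampled_rank, ), range(1, num_logprobs + 1))

# Duplicate key 7 is written twice with identical data; the dict keeps one
# entry, so no special case is needed when the sampled token is in the topk.
d = {tok: (lp, rank)
     for tok, lp, rank in zip(logprob_token_ids, logprobs, ranks)}
assert len(d) == 2 and d[7] == (-0.1, 1)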
+ topk_ranks = range(1, num_logprobs + 1) + ranks = itertools.chain((rank, ), topk_ranks) + + return { + token_id: Logprob( + logprob=logprob, + rank=rank, + decoded_token=token, + ) + for token_id, logprob, rank, token in zip( + logprob_token_ids, logprobs, ranks, decoded_tokens) + } + + def update_from_output(self, output: EngineCoreOutput) -> None: + if output.new_logprobs is not None: + self._update_sample_logprobs(output.new_logprobs) + if output.new_prompt_logprobs_tensors is not None: + self._update_prompt_logprobs(output.new_prompt_logprobs_tensors) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 94736669147..5dbf530caa1 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -5,11 +5,12 @@ from typing import Dict, List, Optional from vllm.outputs import RequestOutput -from vllm.transformers_utils.detokenizer_utils import AnyTokenizer +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest -from vllm.v1.engine.detokenizer import (DetokenizerOutput, - IncrementalDetokenizer) +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason +from vllm.v1.engine.detokenizer import IncrementalDetokenizer +from vllm.v1.engine.logprobs import LogprobsProcessor from vllm.v1.metrics.stats import IterationStats, RequestStateStats @@ -26,16 +27,20 @@ class RequestState: def __init__( self, request_id: str, + output_kind: RequestOutputKind, prompt: Optional[str], prompt_token_ids: List[int], + logprobs_processor: LogprobsProcessor, detokenizer: IncrementalDetokenizer, arrival_time: float, queue: Optional[asyncio.Queue[RequestOutput]], ): self.request_id = request_id + self.output_kind = output_kind self.prompt = prompt self.prompt_token_ids = prompt_token_ids self.prompt_len = len(prompt_token_ids) + self.logprobs_processor = logprobs_processor self.detokenizer = detokenizer self.is_prefilling = True self.queue = queue @@ -51,8 +56,13 @@ def from_new_request( ) -> "RequestState": return cls( request_id=request.request_id, + output_kind=request.sampling_params.output_kind, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, + logprobs_processor=LogprobsProcessor.from_new_request( + tokenizer=tokenizer, + request=request, + ), detokenizer=IncrementalDetokenizer.from_new_request( tokenizer=tokenizer, request=request, @@ -127,13 +137,8 @@ def process_outputs( batch to ensure system overheads are minimized. This is the only function that should loop over EngineCoreOutputs. - If you need to touch every element of the batch, implement a - method called XXXClass.update_from_output() to be called - within the loop below. For examples, see: - * IterationStats.update_from_output() - * Detokenizer.update_from_output() - - TODO(rob): add Protocol makes update_from_output explicit. + If you need to touch every element of the batch, do it from + within the loop below. ********************************************************** """ @@ -154,17 +159,37 @@ def process_outputs( req_state.is_prefilling, req_state.prompt_len, req_state.stats) - req_state.is_prefilling = False - - # 2) Detokenize the token ids into text. - detokenizer_output = req_state.detokenizer.update_from_output( - engine_core_output) - - # 3) Create and handle RequestOutput objects. 
- if detokenizer_output is not None: - request_output = self._make_request_output( - req_state, detokenizer_output) + new_token_ids = engine_core_output.new_token_ids + finish_reason = engine_core_output.finish_reason + + # TODO(andy): prompt logprobs + chunked prefill can + # result in engine core returning an output for a + # partial prefill (in order to send back partial + # prompt logprobs.) This breaks the invariant that + # process_outputs is only operating on engine core + # outputs associated with non-partial completions. + # Currently this is handled by having `is_prefilling` + # check for new decoded tokens, indicating that + # the completion is not partial. + # + # Follow up will aggregate partial prompt logprobs + # in the EngineCore. + req_state.is_prefilling = not new_token_ids + + # 2) Detokenize the token ids into text and check for stop + # strings. + stop_reason = req_state.detokenizer.update(new_token_ids) + if stop_reason: + finish_reason = FinishReason.STOP + + # 3) Compute sample and prompt logprobs for request, + # if required. + req_state.logprobs_processor.update_from_output(engine_core_output) + + # 4) Create and handle RequestOutput objects. + if request_output := self._make_request_output( + req_state, new_token_ids, finish_reason, stop_reason): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put_nowait(request_output) @@ -174,18 +199,16 @@ def process_outputs( # Free completed requests. if request_output.finished: - assert detokenizer_output.finish_reason is not None - self.request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer # detected stop string, abort needed in EngineCore. reqs_to_abort.append(req_id) - # Track per-request stats + # Track per-request stats. + assert finish_reason is not None iteration_stats.update_from_finished_request( - detokenizer_output.finish_reason, request_output, - req_state.stats) + finish_reason, request_output, req_state.stats) return OutputProcessorOutput( request_outputs=request_outputs, @@ -196,20 +219,47 @@ def process_outputs( @staticmethod def _make_request_output( request_state: RequestState, - detokenizer_output: DetokenizerOutput, - ) -> RequestOutput: + new_token_ids: List[int], + finish_reason: Optional[FinishReason], + stop_reason: Optional[str], + ) -> Optional[RequestOutput]: + + finished = finish_reason is not None + output_kind = request_state.output_kind + # In follow up, we will switch to invariant where EngineCore + # does not stream partial prefills. + if not finished and (request_state.is_prefilling + or output_kind == RequestOutputKind.FINAL_ONLY): + # Only the final output is required in FINAL_ONLY mode. 
+ return None + + detokenizer = request_state.detokenizer + logprobs_processor = request_state.logprobs_processor + + delta = output_kind == RequestOutputKind.DELTA + logprobs = logprobs_processor.logprobs + if delta: + if logprobs: + logprobs = logprobs[-len(new_token_ids):] + # Side effect: logprobs processor forgets prompt logprobs + prompt_logprobs = logprobs_processor.pop_prompt_logprobs() + else: + prompt_logprobs = logprobs_processor.prompt_logprobs + request_output = RequestOutput.new( - request_state.request_id, - request_state.prompt, - request_state.prompt_token_ids, - detokenizer_output.output_text, - detokenizer_output.token_ids, - detokenizer_output.finished, + request_id=request_state.request_id, + prompt=request_state.prompt, + prompt_token_ids=request_state.prompt_token_ids, + text=detokenizer.get_next_output_text(finished, delta), + token_ids=new_token_ids if delta else detokenizer.output_token_ids, + logprobs=logprobs, + prompt_logprobs=prompt_logprobs, + cumulative_logprob=logprobs_processor.cumulative_logprob, + finished=finished, ) - if detokenizer_output.finished: + if finished: completion_output = request_output.outputs[0] - completion_output.finish_reason = str( - detokenizer_output.finish_reason) - completion_output.stop_reason = detokenizer_output.stop_reason + completion_output.finish_reason = str(finish_reason) + completion_output.stop_reason = stop_reason return request_output diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 366287951ed..70876b03a82 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -33,6 +33,7 @@ def __init__( ): self.model_config = model_config + self.cache_config = cache_config self.lora_config = lora_config self.tokenizer = tokenizer @@ -51,6 +52,37 @@ def __init__( self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \ cache_config.enable_prefix_caching + def _validate_logprobs( + self, + params: Union[SamplingParams, PoolingParams], + ) -> None: + if not isinstance(params, SamplingParams): + return + + max_logprobs = self.model_config.max_logprobs + # Validate sample logprobs. + if params.logprobs and params.logprobs > max_logprobs: + raise ValueError( + f"Requested sample logprobs of {params.logprobs}, " + f"which is greater than max allowed: {max_logprobs}") + + # Validate prompt logprobs. + if params.prompt_logprobs and params.prompt_logprobs > max_logprobs: + raise ValueError( + f"Requested prompt logprobs of {params.prompt_logprobs}, " + f"which is greater than max allowed: {max_logprobs}") + + # TODO(andy): enable this in follow up by recomputing. + if (params.prompt_logprobs is not None + and self.cache_config.enable_prefix_caching): + raise ValueError("Prefix caching with prompt logprobs not yet " + "supported on VLLM V1.") + + def _validate_lora(self, lora_request: Optional[LoRARequest]) -> None: + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") + def process_inputs( self, request_id: str, @@ -64,12 +96,11 @@ def process_inputs( ) -> EngineCoreRequest: # TODO(woosuk): Support pooling models. - # TODO(woosuk): Check max_logprobs # TODO(woosuk): Support encoder-decoder models. 
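For reference, a minimal standalone sketch of the rule _validate_logprobs enforces above; the function below is an illustration that mirrors the patched checks, not the patched method itself.

from typing import Optional

def validate_logprobs(logprobs: Optional[int],
                      prompt_logprobs: Optional[int],
                      max_logprobs: int,
                      enable_prefix_caching: bool) -> None:
    """Reject requests whose logprob settings the V1 engine cannot serve."""
    if logprobs and logprobs > max_logprobs:
        raise ValueError(
            f"Requested sample logprobs of {logprobs}, "
            f"which is greater than max allowed: {max_logprobs}")
    if prompt_logprobs and prompt_logprobs > max_logprobs:
        raise ValueError(
            f"Requested prompt logprobs of {prompt_logprobs}, "
            f"which is greater than max allowed: {max_logprobs}")
    if prompt_logprobs is not None and enable_prefix_caching:
        raise ValueError("Prefix caching with prompt logprobs not yet "
                         "supported on VLLM V1.")

# Passes, matching test_max_logprobs above.
validate_logprobs(logprobs=1, prompt_logprobs=None,
                  max_logprobs=1, enable_prefix_caching=False)
try:
    validate_logprobs(logprobs=2, prompt_logprobs=None,
                      max_logprobs=1, enable_prefix_caching=False)
except ValueError as exc:
    print("rejected:", exc)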
- if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") + self._validate_logprobs(params) + self._validate_lora(lora_request) + if arrival_time is None: arrival_time = time.time() assert priority == 0, "vLLM V1 does not support priority at the moment." diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index e3f1efcc9b1..5e588d35ea4 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -60,14 +60,17 @@ def update_from_output(self, output: "EngineCoreOutput", self.num_generation_tokens += num_new_generation_tokens if is_prefilling: - # This relies on the invariant that EngineCore does - # not stream outputs for partially completed prefills - # (scheduler.update_from_output makes EngineCoreOutput - # iff num_computed_tokens == num_tokens). - assert (num_new_generation_tokens > 0) - self.num_prompt_tokens += prompt_len - - self.time_to_first_tokens_iter.append(last_token_latency) + # TODO(andy): we used to assert that num_new_generation_tokens + # > 0 with an invariant that EngineCore does not stream outputs + # for partially completed prefills (scheduler.update_from_output + # makes EngineCoreOutput iff num_computed_tokens == num_tokens). + # When prompt logprobs are enabled, we currently stream out the + # partially completed prompt. + # This will be reverted in a follow up PR and we should re-enable + # this assertion / invariant. + if num_new_generation_tokens > 0: + self.num_prompt_tokens += prompt_len + self.time_to_first_tokens_iter.append(last_token_latency) else: self.time_per_output_tokens_iter.append(last_token_latency) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 6e82bffd7e5..27fd2dbda8b 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,25 +1,51 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Dict, List, NamedTuple, Optional import torch -@dataclass -class SamplerOutput: +class LogprobsLists(NamedTuple): + # [num_reqs, max_num_logprobs + 1] + logprob_token_ids: List[List[int]] + # [num_reqs, max_num_logprobs + 1] + logprobs: List[List[float]] # [num_reqs] - sampled_token_ids: torch.Tensor + sampled_token_ranks: List[int] + + def slice(self, start: int, end: int): + return LogprobsLists( + self.logprob_token_ids[start:end], + self.logprobs[start:end], + self.sampled_token_ranks[start:end], + ) + + +class LogprobsTensors(NamedTuple): # [num_reqs, max_num_logprobs + 1] - logprob_token_ids: Optional[torch.Tensor] + logprob_token_ids: torch.Tensor # [num_reqs, max_num_logprobs + 1] - logprobs: Optional[torch.Tensor] + logprobs: torch.Tensor + # [num_reqs] + selected_token_ranks: torch.Tensor - # TODO: Support prompt logprobs. - prompt_logprob_token_ids: Optional[torch.Tensor] - prompt_logprobs: Optional[torch.Tensor] + def tolists(self): + return LogprobsLists( + self.logprob_token_ids.tolist(), + self.logprobs.tolist(), + self.selected_token_ranks.tolist(), + ) + + +@dataclass +class SamplerOutput: + + # [num_reqs] + sampled_token_ids: torch.Tensor + logprobs_tensors: Optional[LogprobsTensors] # ModelRunnerOutput is serialized and sent to the scheduler process. 
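To show how the two new containers above fit together, a small sketch follows; it assumes this patch is applied so that LogprobsTensors (and the LogprobsLists it produces) are importable from vllm.v1.outputs, and the tensor values are arbitrary.

import torch
from vllm.v1.outputs import LogprobsTensors

# Two requests with top-1 logprobs requested -> shape [num_reqs, 1 + 1].
tensors = LogprobsTensors(
    logprob_token_ids=torch.tensor([[7, 3], [9, 2]], dtype=torch.int32),
    logprobs=torch.tensor([[-0.1, -1.2], [-0.3, -0.9]]),
    selected_token_ranks=torch.tensor([1, 2]),
)

# Pythonize on the frontend side...
lists = tensors.tolists()
assert lists.sampled_token_ranks == [1, 2]

# ...and slice out the rows belonging to a single request, as the scheduler
# does with logprobs.slice(req_index, req_index + 1).
first = lists.slice(0, 1)
assert first.logprob_token_ids == [[7, 3]]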
@@ -36,6 +62,12 @@ class ModelRunnerOutput: sampled_token_ids: List[int] # [num_reqs, max_num_logprobs + 1] - logprob_token_ids_cpu: Optional[torch.Tensor] # [num_reqs, max_num_logprobs + 1] - logprobs_cpu: Optional[torch.Tensor] + # [num_reqs] + logprobs: Optional[LogprobsLists] + + # req_id -> (token_ids, logprobs, ranks) + # [prompt_len, num_prompt_logprobs] + # [prompt_len, num_prompt_logprobs] + # [prompt_len] + prompt_logprobs_dict: Dict[str, LogprobsTensors] diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 8e54de34548..1a2771baba9 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -20,7 +20,8 @@ class SamplingMetadata: generators: Dict[int, torch.Generator] - max_num_logprobs: int + # None means no logprobs, 0 means sampled token logprobs only + max_num_logprobs: Optional[int] no_penalties: bool prompt_token_ids: Optional[torch.Tensor] diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 3da7498e0da..43fd64aaaa8 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,11 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 """A layer that samples the next tokens from the model's outputs.""" -from typing import Tuple import torch import torch.nn as nn -from vllm.v1.outputs import SamplerOutput +from vllm.v1.outputs import LogprobsTensors, SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.penalties import (apply_all_penalties, apply_min_token_penalties) @@ -25,20 +24,16 @@ def forward( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: - needs_logprobs = sampling_metadata.max_num_logprobs > 0 - if needs_logprobs: - # NOTE(woosuk): Use the original logits (before any penalties or - # temperature scaling) for the top-k logprobs. - # This is different from the V0 sampler, which uses the logits that - # is used for sampling (after penalties and temperature scaling). - # NOTE: We compute logprobs first because the below ops may - # modify the logits tensor in-place (and we don't want to clone - # the logits tensor for memory efficiency). - topk_logprobs, topk_indices = self.get_topk_logprobs( - logits, sampling_metadata) - else: - topk_logprobs = None - topk_indices = None + + # NOTE(woosuk): Use the original logits (before any penalties or + # temperature scaling) for the top-k logprobs. + # This is different from the V0 sampler, which uses the logits that + # is used for sampling (after penalties and temperature scaling). + # TODO(rob): provide option for logprobs post sampling. + # See https://vllm-dev.slack.com/archives/C07UUL8E61Z/p1735907856007919 # noqa: E501 + num_logprobs = sampling_metadata.max_num_logprobs + if num_logprobs is not None: + raw_logprobs = self.compute_logprobs(logits) # Use float32 for the logits. logits = logits.to(torch.float32) @@ -48,15 +43,19 @@ def forward( logits = self.apply_temperature(logits, sampling_metadata.temperature) # Sample the next token. sampled = self.sample(logits, sampling_metadata) + + # Gather the logprobs of the topk and sampled token (if requested). + # Get logprobs and rank tensors (if requested) + logprobs_tensors = None if num_logprobs is None else \ + self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=sampled) + # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) + # These are GPU tensors. 
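The `gather_logprobs` call above collects the sampled token's own logprob and rank together with the top-k entries (its implementation appears just below in this diff). A self-contained sketch of that arithmetic on dummy logits, illustrative only:

```python
import torch

torch.manual_seed(0)
logits = torch.randn(2, 8)                     # [num_tokens, vocab_size]
logprobs = logits.log_softmax(dim=-1)          # same as compute_logprobs()
token_ids = torch.tensor([[3], [5]])           # sampled (or prompt) token per row

topk_logprobs, topk_ids = torch.topk(logprobs, k=2, dim=-1)
token_logprobs = logprobs.gather(-1, token_ids)
token_ranks = (logprobs >= token_logprobs).sum(-1)   # rank 1 == most likely token

indices = torch.cat((token_ids, topk_ids), dim=1)            # [num_tokens, k + 1]
out_logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1)
```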
sampler_output = SamplerOutput( sampled_token_ids=sampled, - logprob_token_ids=topk_indices, - logprobs=topk_logprobs, - prompt_logprob_token_ids=None, - prompt_logprobs=None, + logprobs_tensors=logprobs_tensors, ) return sampler_output @@ -103,19 +102,52 @@ def sample( ) return sampled - def get_topk_logprobs( + def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor: + return logits.log_softmax(dim=-1, dtype=torch.float32) + + def gather_logprobs( self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Tuple[torch.Tensor, torch.Tensor]: - logprobs = logits.log_softmax(dim=-1, dtype=torch.float32) - # FIXME: Mask the sampled token_id, get topk logprobs, - # and concatenate the topk with the sampled token_id. - topk_logprobs, topk_indices = torch.topk( - logprobs, sampling_metadata.max_num_logprobs, dim=-1) + logprobs: torch.Tensor, + num_logprobs: int, + token_ids: torch.Tensor, + ) -> LogprobsTensors: + """ + Gather logprobs for topk and sampled/prompt token. + + Args: + logits: (num tokens) x (vocab) tensor + num_logprobs: minimum number of logprobs to + retain per token + token_ids: prompt tokens (if prompt logprobs) + or sampled tokens (if sampled + logprobs); 1D token ID tensor + with (num tokens) elements + + Returns: + Top-k int indices tensor, (num tokens) x (num_logprobs + 1) + Top-k float logprobs tensor, (num tokens) x (num_logprobs + 1) + Sampled token rank tensor, (num tokens) + """ + # Find the topK values. + topk_logprobs, topk_indices = torch.topk(logprobs, + num_logprobs, + dim=-1) + + # Get with the logprob of the prompt or sampled token. + token_ids = token_ids.unsqueeze(-1) + token_logprobs = logprobs.gather(-1, token_ids) + + # Compute the ranks of the actual token. + token_ranks = (logprobs >= token_logprobs).sum(-1) + + # Concatenate together with the topk. + indices = torch.cat((token_ids, topk_indices), dim=1) + logprobs = torch.cat((token_logprobs, topk_logprobs), dim=1) + # Use int32 to reduce the tensor size. - topk_indices = topk_indices.to(torch.int32) - return topk_logprobs, topk_indices + indices = indices.to(torch.int32) + + return LogprobsTensors(indices, logprobs, token_ranks) def apply_penalties( self, diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 1791dfa2b63..a7fba65e7c9 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,12 +1,58 @@ # SPDX-License-Identifier: Apache-2.0 import pickle +from typing import Any + +import torch +from msgspec import msgpack + +CUSTOM_TYPE_CODE_PICKLE = 1 class PickleEncoder: - def encode(self, obj): + def encode(self, obj: Any): return pickle.dumps(obj) - def decode(self, data): + def decode(self, data: Any): return pickle.loads(data) + + +class MsgpackEncoder: + """Encoder with custom torch tensor serialization.""" + + def __init__(self): + self.encoder = msgpack.Encoder(enc_hook=custom_enc_hook) + + def encode(self, obj: Any) -> bytes: + return self.encoder.encode(obj) + + def encode_into(self, obj: Any, buf: bytearray) -> None: + self.encoder.encode_into(obj, buf) + + +class MsgpackDecoder: + """Decoder with custom torch tensor serialization.""" + + def __init__(self, t: Any): + self.decoder = msgpack.Decoder(t, ext_hook=custom_ext_hook) + + def decode(self, obj: Any): + return self.decoder.decode(obj) + + +def custom_enc_hook(obj: Any) -> Any: + if isinstance(obj, torch.Tensor): + # NOTE(rob): it is fastest to use numpy + pickle + # when serializing torch tensors. 
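A rough round-trip sketch of the encoder/decoder pair defined above (the matching `custom_ext_hook` follows just below), assuming both helpers land in `vllm.v1.serial_utils` as added here:

```python
from typing import Any

import torch
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder

encoder = MsgpackEncoder()
decoder = MsgpackDecoder(Any)          # decode into plain Python objects

t = torch.arange(6, dtype=torch.int32).reshape(2, 3)
payload = encoder.encode(t)            # tensor boxed as a msgpack Ext via numpy + pickle
restored = decoder.decode(payload)     # custom_ext_hook unpickles it back into a tensor
assert torch.equal(restored, t)
```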
+ # https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 + return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) + + raise NotImplementedError(f"Objects of type {type(obj)} are not supported") + + +def custom_ext_hook(code: int, data: memoryview) -> Any: + if code == CUSTOM_TYPE_CODE_PICKLE: + return torch.from_numpy(pickle.loads(data)) + + raise NotImplementedError(f"Extension type code {code} is not supported") diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index a31e8886561..d5b8fd21841 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -176,7 +176,9 @@ def __init__( self.generators: Dict[int, torch.Generator] = {} self.num_logprobs: Dict[str, int] = {} - self.prompt_logprob_reqs: Set[str] = set() + # NOTE(rob): num_prompt_logprobs only includes reqs + # that are currently in the prefill phase. + self.num_prompt_logprobs: Dict[str, int] = {} def add_request( self, @@ -238,11 +240,10 @@ def add_request( if request.generator is not None: self.generators[req_index] = request.generator - num_logprobs = sampling_params.logprobs - if num_logprobs is not None and num_logprobs > 0: - self.num_logprobs[req_id] = num_logprobs - if sampling_params.prompt_logprobs: - self.prompt_logprob_reqs.add(req_id) + if sampling_params.logprobs is not None: + self.num_logprobs[req_id] = sampling_params.logprobs + if sampling_params.prompt_logprobs is not None: + self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs # Add request lora ID if request.lora_request: @@ -272,7 +273,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.repetition_penalties_reqs.discard(req_id) self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) - self.prompt_logprob_reqs.discard(req_id) + self.num_prompt_logprobs.pop(req_id, None) # LoRA lora_id = self.request_lora_mapping[req_index] @@ -297,7 +298,7 @@ def clear(self) -> None: self.repetition_penalties_reqs.clear() self.generators.clear() self.num_logprobs.clear() - self.prompt_logprob_reqs.clear() + self.num_prompt_logprobs.clear() self.request_lora_mapping.fill(0) self.lora_id_to_lora_request.clear() self.lora_id_to_request_ids.clear() @@ -489,13 +490,9 @@ def no_penalties(self) -> bool: and len(self.repetition_penalties_reqs) == 0) @property - def max_num_logprobs(self) -> int: - return max(self.num_logprobs.values()) if self.num_logprobs else 0 - - @property - def no_logprob(self) -> bool: - return len(self.num_logprobs) == 0 + def max_num_logprobs(self) -> Optional[int]: + return max(self.num_logprobs.values()) if self.num_logprobs else None @property def no_prompt_logprob(self) -> bool: - return len(self.prompt_logprob_reqs) == 0 + return not self.num_prompt_logprobs diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index bfc9d1ca83f..561c3cf39e9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -29,7 +29,7 @@ from vllm.v1.engine.mm_input_mapper import MMInputMapperClient from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) -from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.outputs import LogprobsTensors, ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -804,8 +804,8 @@ def execute_model( inputs_embeds=inputs_embeds, ) hidden_states = 
hidden_states[:num_scheduled_tokens] - hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(hidden_states, None) + sample_hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(sample_hidden_states, None) # Sample the next token and get logprobs if needed. sampling_metadata = self._prepare_sampling(batch_changed) @@ -818,7 +818,8 @@ def execute_model( # the requests one by one. Optimize. num_reqs = self.input_batch.num_reqs request_seq_lens: List[Tuple[int, CachedRequestState, int]] = [] - for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + for i, req_id in enumerate( # type: ignore[assignment] + self.input_batch.req_ids[:num_reqs]): assert req_id is not None req_state = self.requests[req_id] seq_len = (req_state.num_computed_tokens + @@ -847,27 +848,28 @@ def execute_model( # NOTE: GPU -> CPU Sync happens here. # Move as many CPU operations as possible before this sync point. sampled_token_ids = sampler_output.sampled_token_ids.tolist() + logprobs_tensors = sampler_output.logprobs_tensors + logprobs_lists = logprobs_tensors.tolists() \ + if logprobs_tensors is not None else None + + # Compute prompt logprobs if needed. + prompt_logprobs_dict = self._get_prompt_logprobs_dict( + hidden_states, + scheduler_output, + ) + # Update with the actual token ids for i, req_state, seq_len in request_seq_lens: token_id = sampled_token_ids[i] self.input_batch.token_ids_cpu[i, seq_len] = token_id req_state.output_token_ids[-1] = token_id - if sampler_output.logprob_token_ids is None: - logprob_token_ids = None - else: - logprob_token_ids = sampler_output.logprob_token_ids.cpu() - if sampler_output.logprobs is None: - logprobs = None - else: - logprobs = sampler_output.logprobs.cpu() - model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=sampled_token_ids, - logprob_token_ids_cpu=logprob_token_ids, - logprobs_cpu=logprobs, + logprobs=logprobs_lists, + prompt_logprobs_dict=prompt_logprobs_dict, ) return model_runner_output @@ -886,6 +888,76 @@ def load_model(self) -> None: logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) + def _get_prompt_logprobs_dict( + self, + hidden_states: torch.Tensor, + scheduler_output: "SchedulerOutput", + ) -> Dict[str, LogprobsTensors]: + num_prompt_logprobs_dict = self.input_batch.num_prompt_logprobs + if not num_prompt_logprobs_dict: + return {} + + prompt_logprobs_dict: Dict[str, LogprobsTensors] = {} + + # Since prompt logprobs are a rare feature, prioritize simple, + # maintainable loop over optimal performance. + completed_prefill_reqs = [] + for req_id, num_prompt_logprobs in num_prompt_logprobs_dict.items(): + + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + + # Get metadata for this request. + request = self.requests[req_id] + num_prompt_tokens = len(request.prompt_token_ids) + prompt_token_ids = torch.tensor(request.prompt_token_ids).to( + self.device, non_blocking=True) + + # Determine number of logits to retrieve. + start_tok = request.num_computed_tokens + 1 + num_remaining_tokens = num_prompt_tokens - start_tok + if num_tokens < num_remaining_tokens: + # This is a chunk, more tokens remain. + num_logits = num_tokens + else: + # This is the last chunk of prompt tokens to return. + num_logits = num_remaining_tokens + completed_prefill_reqs.append(req_id) + + # Get the logits corresponding to this req's prompt tokens. + # If this is a partial request (i.e. 
chunked prefill), + # then there is prompt logprob generated for each index. + req_idx = self.input_batch.req_id_to_index[req_id] + offset = self.query_start_loc_np[req_idx].item() + prompt_hidden_states = hidden_states[offset:offset + num_logits] + logits = self.model.compute_logits(prompt_hidden_states, None) + + # Get the "target" tokens for each index. For prompt at index i, + # the token at prompt index i+1 is the "sampled" token we want + # to gather the logprob for. + tgt_token_ids = prompt_token_ids[start_tok:start_tok + num_logits] + + # Compute prompt logprobs. + logprobs = self.model.sampler.compute_logprobs(logits) + token_ids, logprobs, ranks = self.model.sampler.gather_logprobs( + logprobs, num_prompt_logprobs, tgt_token_ids) + + # Transfer GPU->CPU async. + prompt_logprobs_dict[req_id] = LogprobsTensors( + token_ids.to("cpu", non_blocking=True), + logprobs.to("cpu", non_blocking=True), + ranks.to("cpu", non_blocking=True), + ) + + # Remove requests that have completed prefill from the batch + # num_prompt_logprobs_dict. + for req_id in completed_prefill_reqs: + del num_prompt_logprobs_dict[req_id] + + # Must synchronize the non-blocking GPU->CPU transfers. + torch.cuda.synchronize() + + return prompt_logprobs_dict + @torch.inference_mode() def _dummy_run( self, From 2824b375da3b46535ecb60a507494a4abb0a8ced Mon Sep 17 00:00:00 2001 From: TJian Date: Sat, 8 Feb 2025 00:13:43 +0800 Subject: [PATCH 0054/1240] [ROCm] [Feature] [Doc] [Dockerfile] [BugFix] Support Per-Token-Activation Per-Channel-Weight FP8 Quantization Inferencing (#12501) Signed-off-by: Louis Ulmer --- Dockerfile.rocm_base | 2 +- .../installation/gpu/rocm.inc.md | 64 ++++++--- tests/quantization/test_fp8.py | 49 +++++-- tests/quantization/test_ptpc_fp8.py | 55 ++++++++ .../layers/quantization/__init__.py | 3 + .../layers/quantization/ptpc_fp8.py | 125 ++++++++++++++++++ .../layers/quantization/utils/w8a8_utils.py | 27 ++++ vllm/platforms/rocm.py | 2 +- 8 files changed, 295 insertions(+), 32 deletions(-) create mode 100644 tests/quantization/test_ptpc_fp8.py create mode 100644 vllm/model_executor/layers/quantization/ptpc_fp8.py diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base index 5bbe98b0c22..e33e73b3030 100644 --- a/Dockerfile.rocm_base +++ b/Dockerfile.rocm_base @@ -6,7 +6,7 @@ ARG RCCL_BRANCH="648a58d" ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="8d4926e" +ARG PYTORCH_BRANCH="3a585126" ARG PYTORCH_VISION_BRANCH="v0.19.1" ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index c8fd11415cf..336d578de40 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -1,6 +1,6 @@ # Installation -vLLM supports AMD GPUs with ROCm 6.2. +vLLM supports AMD GPUs with ROCm 6.3. :::{attention} There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source. @@ -9,7 +9,7 @@ There are no pre-built wheels for this device, so you must either use the pre-bu ## Requirements - GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) -- ROCm 6.2 +- ROCm 6.3 ## Set up using Python @@ -24,9 +24,15 @@ Currently, there are no pre-built ROCm wheels. 
- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) - [PyTorch](https://pytorch.org/) - For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3. - Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/). Example: + + ```console + # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.3 + ``` 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) @@ -37,7 +43,7 @@ Currently, there are no pre-built ROCm wheels. pip uninstall -y triton git clone https://github.com/OpenAI/triton.git cd triton - git checkout e192dba + git checkout e5be006 cd python pip3 install . cd ../.. @@ -49,15 +55,15 @@ Currently, there are no pre-built ROCm wheels. 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) - Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) + Install ROCm's flash attention (v2.7.2) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) Alternatively, wheels intended for vLLM use can be accessed under the releases. - For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. + For example, for ROCm 6.3, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. ```console git clone https://github.com/ROCm/flash-attention.git cd flash-attention - git checkout 3cea2fb + git checkout b7d29fb git submodule update --init GPU_ARCHS="gfx90a" python3 setup.py install cd .. @@ -67,20 +73,16 @@ Currently, there are no pre-built ROCm wheels. You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) ::: -3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: +3. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps: ```bash $ pip install --upgrade pip - # Install PyTorch - $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2 - # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi # Install dependencies - $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install --upgrade numba scipy huggingface-hub[cli,hf_transfer] setuptools_scm $ pip install "numpy<2" $ pip install -r requirements-rocm.txt @@ -104,7 +106,7 @@ Currently, there are no pre-built ROCm wheels. For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). 
::: -## Set up using Docker +## Set up using Docker (Recommended) ### Pre-built images @@ -120,7 +122,12 @@ for instructions on how to use this prebuilt docker image. Building the Docker image from source is the recommended way to use vLLM with ROCm. -First, build a docker image from and launch a docker container from the image. +#### (Optional) Build an image with ROCm software stack + +Build a docker image from which setup ROCm software stack needed by the vLLM. +**This step is optional as this rocm_base image is usually prebuilt and store at [Docker Hub](https://hub.docker.com/r/rocm/vllm-dev) under tag `rocm/vllm-dev:base` to speed up user experience.** +If you choose to build this rocm_base image yourself, the steps are as follows. + It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: ```console @@ -131,7 +138,26 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` - uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. +To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: + +```console +DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm_base -t rocm/vllm-dev:base . +``` + +#### Build an image with vLLM + +First, build a docker image from and launch a docker container from the image. +It is important that the user kicks off the docker build using buildkit. Either the user put `DOCKER_BUILDKIT=1` as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: + +```console +{ + "features": { + "buildkit": true + } +} +``` + + uses ROCm 6.3 by default, but also supports ROCm 5.7, 6.0, 6.1, and 6.2, in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using @@ -141,13 +167,13 @@ It provides flexibility to customize the build of docker image using the followi Their values can be passed in when running `docker build` with `--build-arg` options. -To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: +To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default: ```console DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . ``` -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: +To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm . 
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 5616935ebdc..3a7f0a196b5 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -55,10 +55,21 @@ def check_model(model): assert isinstance(attn.quant_method, Fp8KVCacheMethod) - # NOTE: it is valid for scales to be 1.0 (default value), but - # we know these checkpoints have scales < 1.0 - assert 0.0 < attn._k_scale < 1.0 - assert 0.0 < attn._v_scale < 1.0 + if not current_platform.is_rocm(): + # NOTE: This code path requires validation on Non-CUDA platform + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + else: + # NOTE: This code path is for ROCm platform + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + # However on ROCm platform, the _k_scale and _v_scale will be + # scaled by a factor of 2 as described in + # vllm/model_executor/layers/quantization/kv_cache.py + assert 0.0 < attn._k_scale < (1.0 * 2.0) + assert 0.0 < attn._v_scale < (1.0 * 2.0) llm.apply_model(check_model) @@ -91,13 +102,29 @@ def check_model(model): assert attn._k_scale == 1.0 assert attn._v_scale == 1.0 - if current_platform.has_device_capability(89) and not force_marlin: - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fn - else: - # For GPUs without hardware support, we pack the fp8 weights - # for weight-only quantization using Marlin kernels - assert fc1.weight.dtype == torch.int32 + if current_platform.is_cuda(): + if current_platform.has_device_capability( + 89) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fn + else: + # For GPUs without hardware support, we pack the fp8 weights + # for weight-only quantization using Marlin kernels + assert fc1.weight.dtype == torch.int32 + elif current_platform.is_rocm(): + # Only MI300 and above support quantization='fp8' + if current_platform.has_device_capability( + 94) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fnuz + else: # unsupported ROCm platform + pytest.skip( + "Skip `test_load_fp16_model`. " + "It only runs on ROCm platform with FP8 compute." + " e.g. MI300X and above.") + else: # unsupported platform + pytest.skip("Skip `test_load_fp16_model`. " + "It only runs on CUDA and ROCm platform.") llm.apply_model(check_model) diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py new file mode 100644 index 00000000000..9bbb5e32796 --- /dev/null +++ b/tests/quantization/test_ptpc_fp8.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests whether PTPC w8a8 FP8 computation is enabled correctly. + +Run `pytest tests/quantization/test_ptpc_fp8.py --forked`. 
+""" +import pytest +import torch + +from tests.quantization.utils import is_quant_method_supported +from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod +from vllm.model_executor.layers.quantization.ptpc_fp8 import ( + PTPCFp8LinearMethod) +from vllm.platforms import current_platform + + +@pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"), + reason="PTPC FP8 is not supported on this GPU type.") +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="This test is for ROCm GPU.") +@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"]) +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"]) +def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: + + try: + with vllm_runner("facebook/opt-125m", + dtype=dtype, + quantization="ptpc_fp8", + kv_cache_dtype=kv_cache_dtype) as llm: + + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, PTPCFp8LinearMethod) + if kv_cache_dtype == "ptpc_fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 + + if current_platform.has_device_capability(94): + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fnuz + else: + pytest.skip() + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output + except AssertionError as e: + if str( + e + ) == "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. torch.float16 is specified.": # noqa: E501 + # If the error message matches, the test passes + pass + else: + # If the error message does not match, re-raise the exception + raise diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 6ded3874fc1..6cd508d057a 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -11,6 +11,7 @@ "deepspeedfp", "tpu_int8", "fp8", + "ptpc_fp8", "fbgemm_fp8", "modelopt", # The order of gptq methods is important for config.py iteration over @@ -99,6 +100,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: from .modelopt import ModelOptFp8Config from .moe_wna16 import MoeWNA16Config from .neuron_quant import NeuronQuantConfig + from .ptpc_fp8 import PTPCFp8Config from .qqq import QQQConfig from .tpu_int8 import Int8TpuConfig @@ -120,6 +122,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "gptq": GPTQConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, + "ptpc_fp8": PTPCFp8Config, "qqq": QQQConfig, "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py new file mode 100644 index 00000000000..1ded5389e5f --- /dev/null +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -0,0 +1,125 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, List, Optional + +import torch +from torch.nn.parameter import Parameter + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.base_config 
import ( + QuantizeMethodBase) +from vllm.model_executor.layers.quantization.fp8 import (Fp8Config, + Fp8KVCacheMethod, + Fp8LinearMethod) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear) +from vllm.platforms import current_platform + +ACTIVATION_SCHEMES = ["static", "dynamic"] + +logger = init_logger(__name__) + + +class PTPCFp8Config(Fp8Config): + """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8.""" + + def __init__( + self, + activation_scheme: str = "dynamic", + ignored_layers: Optional[List[str]] = None, + ) -> None: + if not current_platform.is_rocm(): + raise ValueError( + "ptpc_fp8 quantization is supported only on ROCm.") + + if not current_platform.has_device_capability(94): + raise ValueError( + "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer." # noqa: E501 + ) + if activation_scheme == "static": + raise ValueError( + "ptpc_fp8 as of now only support dynamic quantization.") + + super().__init__(is_checkpoint_fp8_serialized=False, + activation_scheme=activation_scheme, + ignored_layers=ignored_layers) + + @classmethod + def get_name(cls) -> str: + return "ptpc_fp8" + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "PTPCFp8Config": + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) + return cls(activation_scheme=activation_scheme, + ignored_layers=ignored_layers) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + from vllm.attention.layer import Attention # Avoid circular import + + if isinstance(layer, LinearBase): + if is_layer_skipped(prefix, self.ignored_layers): + return UnquantizedLinearMethod() + return PTPCFp8LinearMethod(self) + elif isinstance(layer, Attention): + return Fp8KVCacheMethod(self) + return None + + +class PTPCFp8LinearMethod(Fp8LinearMethod): + """Linear method for Per-Token and Per-Channel FP8 Quantization. + Only supports loading quantized BF16 model checkpoints with dynamic + activation scaling. To load FP16 model checkpoints, user must specify + to convert the FP16 model weight loading into BF16. + The weight scaling factor will be initialized after + the model weights are loaded. + + Limitations: + 1. Only support float8_e4m3fnuz data type due to the limitation of + torch._scaled_mm (https://github.com/ROCm/pytorch/blob/8c0504d7f3fb0ee4c278c096a5c3caedb01129fa/aten/src/ATen/native/cuda/Blas.cpp#L1041) + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: PTPCFp8Config): + super().__init__(quant_config=quant_config) + # Force weight quantization + self.quant_config.is_checkpoint_fp8_serialized = False + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + layer.weight = torch.nn.Parameter(layer.weight.data, + requires_grad=False) + + assert layer.weight.data.dtype == torch.bfloat16, \ + f"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. {str(layer.weight.data.dtype)} is specified." # noqa: E501 + # Quantize the weights. + qweight, weight_scale = ops.scaled_fp8_quant( + layer.weight, scale=None, use_per_token_if_dynamic=True) + + # Update the layer with the new values. 
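Before the weight update below, a numerically simplified sketch of what per-token-activation, per-channel-weight scaling means; plain float tensors stand in for FP8 values here, whereas the real path uses `torch._scaled_mm` with rowwise scales on ROCm (wired up in `w8a8_utils.py` later in this patch):

```python
import torch

torch.manual_seed(0)
x = torch.randn(4, 16)                     # activations: [num_tokens, hidden]
w = torch.randn(16, 32)                    # weight:      [hidden, out_features]

# One dynamic scale per token (row of x), one scale per output channel of w.
# 448 is roughly the e4m3 max value; purely illustrative here.
x_scale = x.abs().amax(dim=1, keepdim=True) / 448.0   # [num_tokens, 1]
w_scale = w.abs().amax(dim=0, keepdim=True) / 448.0   # [1, out_features]

qx = (x / x_scale).round()                 # stand-in for the FP8 cast
qw = (w / w_scale).round()

# Rowwise GEMM + dequant: token scales broadcast over rows, channel scales over columns.
y = (qx @ qw) * x_scale * w_scale
print((y - x @ w).abs().max())             # small quantization error
```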
+ layer.weight = Parameter( + qweight.t(), requires_grad=False) # Pretranspose the weight + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + layer.input_scale = None + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + + return apply_fp8_linear(input=x, + weight=layer.weight, + weight_scale=layer.weight_scale, + input_scale=None, + input_scale_ub=None, + bias=bias, + cutlass_fp8_supported=False, + use_per_token_if_dynamic=True) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index dedeb0c296b..bea6390f71f 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -11,6 +11,13 @@ # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32) +# The condition to determine if it is on a platform that supports +# torch._scaled_mm rowwise feature. +# The condition is determined once as the operations +# are time consuming. +USE_ROWWISE_TORCH_SCALED_MM = (current_platform.is_rocm() + and current_platform.has_device_capability(94)) + def sparse_cutlass_supported() -> bool: if not current_platform.is_cuda(): @@ -172,6 +179,26 @@ def apply_fp8_linear( return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape) + elif (use_per_token_if_dynamic and not per_tensor_weights + and not per_tensor_activations and USE_ROWWISE_TORCH_SCALED_MM): + # For now validated on ROCm platform + # fp8 rowwise scaling in torch._scaled_mm is introduced in + # https://github.com/pytorch/pytorch/pull/144432 using + # hipBLASLt and ROCm 6.3, which only exists in torch 2.7 and above. + # For CUDA platform please validate if the + # torch._scaled_mm support rowwise scaled GEMM + # Fused GEMM_DQ Rowwise GEMM + output = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale.t(), + bias=bias) + + output = torch.narrow(output, 0, 0, input_2d.shape[0]) + output = output.view(*output_shape) + return output + else: # Fallback for channelwise case, where we use unfused DQ # due to limitations with scaled_mm diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 035766289ae..1f690b7111e 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -72,7 +72,7 @@ class RocmPlatform(Platform): supported_quantization: list[str] = [ "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", - "fbgemm_fp8", "gguf", "quark" + "fbgemm_fp8", "gguf", "quark", "ptpc_fp8" ] @classmethod From b712ab494ccc5efadf9a064da1268fb436085b2b Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Fri, 7 Feb 2025 18:07:03 -0500 Subject: [PATCH 0055/1240] [V1] LM Eval With Streaming Integration Tests (#11590) Signed-off-by: Louis Ulmer --- .buildkite/test-pipeline.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7ef40564c5b..ab6a576b22b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -195,6 +195,9 @@ steps: # TODO: accuracy does not match, whether setting # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - VLLM_USE_V1=1 pytest -v -s v1/e2e + # Integration test for streaming correctness (requires special branch). 
+ - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/test_accuracy.py::test_lm_eval_accuracy_v1_engine - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" From e7c9e42e283daea576d83e4c2005844a6186fe85 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Fri, 7 Feb 2025 16:39:50 -0800 Subject: [PATCH 0056/1240] [Bugfix] Fix disagg hang caused by the prefill and decode communication issues (#12723) Signed-off-by: Lu Fang Signed-off-by: Louis Ulmer --- .../kv_lookup_buffer/simple_buffer.py | 87 +++++++++---------- 1 file changed, 40 insertions(+), 47 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 5e1b62352d1..3462f7de020 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -10,7 +10,6 @@ stop the prefill instance when the decode instance is slow. """ import threading -import time from collections import deque from typing import Deque, List, Optional, Union @@ -29,13 +28,13 @@ class SimpleBuffer(KVLookupBufferBase): def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: float): """ - signal_pipe: on CPU - - NOTE: on-device recv will block all threads in the process, making the - KV cache producer unable to listen to new request while transmitting - KV cache. Luckily CPU recv only blocks the current thread so we use + signal_pipe: on CPU + + NOTE: on-device recv will block all threads in the process, making the + KV cache producer unable to listen to new request while transmitting + KV cache. Luckily CPU recv only blocks the current thread so we use CPU recv to listen to new request. - + data_pipe: on device (e.g. GPU) """ @@ -43,7 +42,7 @@ def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, self.buffer_size = 0 self.buffer_size_threshold = buffer_size_thresh - self.buffer_lock = threading.Lock() + self.buffer_cv = threading.Condition() self.signal_pipe = signal_pipe self.data_pipe = data_pipe self.request_handling_thread: Optional[threading.Thread] = None @@ -116,11 +115,19 @@ def _add_to_buffer(self, input_tokens: torch.Tensor, roi: torch.Tensor, hidden = hidden.clone() buffer_item = [input_tokens, roi, key, value, hidden] + data_size = sum([self._get_element_size(data) for data in buffer_item]) + + with self.buffer_cv: + if self.buffer_size + data_size > self.buffer_size_threshold: + # log outside the while loop to avoid this message being logged + # repeatedly. + logger.debug("KV transfer buffer is full. Handling...") + while self.buffer_size + data_size > self.buffer_size_threshold: + self.buffer_cv.wait() - with self.buffer_lock: - for data in buffer_item: - self.buffer_size += self._get_element_size(data) + self.buffer_size += data_size self.buffer.append(buffer_item) + self.buffer_cv.notify() def _is_end_signal(self, signal): return signal is None @@ -143,35 +150,31 @@ def drop_select_handler(self): roi = (roi > 0.5) tokens_roi_recver = [input_tokens, roi] - matched_length = 0 - - # perform input tokens and roi matching - # FIXME: this matching is O(n), ideally it should be O(1) - # but this buffer size won't (and shouldn't) be too large so - # the fix is not urgent. 
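The core of this fix replaces sleep-polling with a `threading.Condition`: the producer waits until the buffer has room, the consumer waits until a matching item exists, and each side notifies the other after mutating the deque. A minimal standalone sketch of that pattern (illustrative only, not the vLLM classes; with a single producer and a single consumer thread, `notify()` is sufficient, otherwise prefer `notify_all()`):

```python
import threading
from collections import deque

buffer, capacity = deque(), 2
cv = threading.Condition()

def insert(item):
    with cv:
        while len(buffer) >= capacity:   # block instead of busy-waiting
            cv.wait()
        buffer.append(item)
        cv.notify()                      # wake a consumer waiting for data

def drop_select():
    with cv:
        while not buffer:                # re-check the predicate after every wakeup
            cv.wait()
        item = buffer.popleft()
        cv.notify()                      # wake a producer blocked on capacity
        return item
```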
- with self.buffer_lock: - + def is_buffer_available( + tokens_roi_recver: List[torch.Tensor], ) -> bool: + # perform input tokens and roi matching + # FIXME: this matching is O(n), ideally it should be O(1) + # but this buffer size won't (and shouldn't) be too large so + # the fix is not urgent. for _ in range(len(self.buffer)): - - temp_length = self._matches(self.buffer[0], - tokens_roi_recver) - if temp_length > 0: - matched_length = temp_length - break + if self._matches(self.buffer[0], + tokens_roi_recver) > 0: + return True # rotate the element we just accessed to the end self.buffer.rotate(-1) - - if matched_length > 0: - # need to clone the tensor - # in case the tensor is freed before sending finishes - matched_item = self.buffer.popleft() - for tensor in matched_item: - self._send_tensor_and_dec_size(tensor) - - else: - # no match, just send None - for _ in range(5): - self.data_pipe.send_tensor(None) + return False + + with self.buffer_cv: + while not is_buffer_available(tokens_roi_recver): + logger.debug( + "KV transfer buffer is not available. Waiting...") + self.buffer_cv.wait() + # need to clone the tensor + # in case the tensor is freed before sending finishes + matched_item = self.buffer.popleft() + for tensor in matched_item: + self._send_tensor_and_dec_size(tensor) + self.buffer_cv.notify() except RuntimeError as e: if 'Connection closed by peer' not in str(e): @@ -208,20 +211,10 @@ def drop_select( return [input_tokens, roi, key, value, hidden] - def full_handler(self): - time.sleep(0.001) - def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, key: torch.Tensor, value: torch.Tensor, hidden: torch.Tensor) -> None: - if self.buffer_size > self.buffer_size_threshold: - # log outside the while loop to avoid this message being logged - # repeatedly. - logger.debug("KV transfer buffer is full. Handling...") - while self.buffer_size > self.buffer_size_threshold: - self.full_handler() - self._add_to_buffer(input_tokens, roi, key, value, hidden) # when calling the insert, the current process is a sender From 1f927ce8c53d19093320eddae17d011ba7f6d994 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 7 Feb 2025 19:07:37 -0800 Subject: [PATCH 0057/1240] [V1][Minor] Remove outdated comment (#12928) Signed-off-by: Woosuk Kwon Signed-off-by: Louis Ulmer --- vllm/v1/core/kv_cache_manager.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index de349ec1209..df3dc6c28e3 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -299,9 +299,7 @@ def get_num_common_prefix_blocks( While all scheduled requests must be in the RUNNING state, the inverse is not necessarily true. There may be RUNNING requests that are not - scheduled in the current step. As of 1/1/2025, the scheduler does not - allow this case, but it is possible in the future, as we allow more - flexible scheduling. + scheduled in the current step. This can result in an edge case where the number of common prefix blocks is 0, even though all scheduled requests share a common prefix. 
This From f53c84d9d48ce202c2dc96fd94678923a270cf73 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 7 Feb 2025 19:14:10 -0800 Subject: [PATCH 0058/1240] [V1] Move KV block hashes from Request to KVCacheManager (#12922) Signed-off-by: Woosuk Kwon Signed-off-by: Louis Ulmer --- tests/v1/core/test_prefix_caching.py | 21 ++++++++++--------- vllm/v1/core/kv_cache_manager.py | 31 +++++++++++++++++++++------- vllm/v1/core/scheduler.py | 1 + vllm/v1/request.py | 13 ------------ 4 files changed, 35 insertions(+), 31 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index a6c0162d3f3..d598d12571f 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -51,7 +51,7 @@ def test_prefill(): all_token_ids = common_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0) - assert len(req0.kv_block_hashes) == 3 + assert len(manager.req_to_block_hashes[req0.request_id]) == 3 assert not computed_blocks assert num_computed_tokens == 0 blocks = manager.allocate_slots(req0, 55, computed_blocks) @@ -76,7 +76,7 @@ def test_prefill(): unique_token_ids = [3] * 5 req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) - assert len(req1.kv_block_hashes) == 3 + assert len(manager.req_to_block_hashes[req1.request_id]) == 3 assert [b.block_id for b in computed_blocks] == [0, 1, 2] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -107,7 +107,7 @@ def test_prefill(): unique_token_ids = [3] * 6 req2 = make_request("2", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert len(req2.kv_block_hashes) == 3 + assert len(manager.req_to_block_hashes[req2.request_id]) == 3 assert [b.block_id for b in computed_blocks] == [0, 1, 2] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 @@ -494,10 +494,11 @@ def test_mm_prefix_caching(): # Completed block should have hashes with extra keys. assert not computed_blocks assert num_computed_tokens == 0 - assert len(req0.kv_block_hashes) == 3 - assert req0.kv_block_hashes[0].extra_keys == ("aaa", ) - assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb") - assert req0.kv_block_hashes[2].extra_keys == ("bbb", ) + block_hashes = manager.req_to_block_hashes[req0.request_id] + assert len(block_hashes) == 3 + assert block_hashes[0].extra_keys == ("aaa", ) + assert block_hashes[1].extra_keys == ("aaa", "bbb") + assert block_hashes[2].extra_keys == ("bbb", ) blocks = manager.allocate_slots(req0, 59, computed_blocks) assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] @@ -510,8 +511,8 @@ def test_mm_prefix_caching(): assert new_blocks is not None and len(new_blocks) == 0 # The just completed block should have hashes with extra keys. - assert len(req0.kv_block_hashes) == 4 - assert req0.kv_block_hashes[3].extra_keys == ("ccc", ) + assert len(block_hashes) == 4 + assert block_hashes[3].extra_keys == ("ccc", ) # Cache hit. 
unique_token_ids = [-1] * 7 + [200] * 5 @@ -613,7 +614,7 @@ def test_reset_prefix_cache(): all_token_ids = full_block_token_ids + unique_token_ids req1 = make_request("1", all_token_ids) computed_blocks, _ = manager.get_computed_blocks(req1) - assert len(req1.kv_block_hashes) == 3 + assert len(manager.req_to_block_hashes[req1.request_id]) == 3 assert len(computed_blocks) == 3 blocks = manager.allocate_slots(req1, 7, computed_blocks) assert [b.block_id for b in blocks] == [4] diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index df3dc6c28e3..eefc2e19c20 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -72,6 +72,12 @@ def __init__( self.req_to_blocks: DefaultDict[str, List[KVCacheBlock]] = defaultdict(list) + # Mapping from request ID to kv block hashes. + # This is to avoid recomputing the block hashes for each call of + # `get_computed_blocks` or `allocate_slots`. + self.req_to_block_hashes: DefaultDict[ + str, List[BlockHashType]] = defaultdict(list) + @property def usage(self) -> float: return 1.0 - (self.free_block_queue.num_free_blocks / @@ -97,11 +103,11 @@ def get_computed_blocks( computed_blocks = [] # The block hashes for the request may already be computed - # if the request was preempted and resumed. - if not request.kv_block_hashes: - request.set_kv_block_hashes( - hash_request_tokens(self.block_size, request)) - block_hashes = request.kv_block_hashes + # if the scheduler has tried to schedule the request before. + block_hashes = self.req_to_block_hashes[request.request_id] + if not block_hashes: + block_hashes = hash_request_tokens(self.block_size, request) + self.req_to_block_hashes[request.request_id] = block_hashes for block_hash in block_hashes: # block_hashes is a chain of block hashes. If a block hash is not @@ -435,7 +441,8 @@ def _cache_full_blocks( full_blocks: The list of blocks to update hash metadata. prev_block: The previous block in the chain. """ - num_cached_block_hashes = len(request.kv_block_hashes) + block_hashes = self.req_to_block_hashes[request.request_id] + num_cached_block_hashes = len(block_hashes) # Update the new blocks with the block hashes through the chain. prev_block_hash_value = None @@ -468,7 +475,7 @@ def _cache_full_blocks( # this request (either the prompt tokens or the previously # generated tokens with preemption). In this case we simply # reuse the block hash. - block_hash = request.kv_block_hashes[blk_idx] + block_hash = block_hashes[blk_idx] else: # Otherwise compute the block hash and cache it in the request # in case it will be preempted in the future. @@ -490,9 +497,17 @@ def _cache_full_blocks( # Compute the hash of the current block. block_hash = hash_block_tokens(prev_block_hash_value, block_tokens, extra_keys) - request.append_kv_block_hashes(block_hash) + block_hashes.append(block_hash) # Update and added the full block to the cache. blk.block_hash = block_hash self.cached_block_hash_to_block[block_hash][blk.block_id] = blk prev_block_hash_value = block_hash.hash_value + + def free_block_hashes(self, request: Request) -> None: + """Discard the block hashes for the request. + + NOTE: Unlike `free`, this method should be called only when the request + is finished, not when it is preempted. 
+ """ + self.req_to_block_hashes.pop(request.request_id, None) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 35d9424f942..1aa34ee3860 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -579,6 +579,7 @@ def finish_requests( def _free_request(self, request: Request) -> None: assert request.is_finished() self.kv_cache_manager.free(request) + self.kv_cache_manager.free_block_hashes(request) self.encoder_cache_manager.free(request) self._cached_reqs_data.pop(request.request_id, None) del self.requests[request.request_id] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 89b39ea615d..bb4d2c19197 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -12,7 +12,6 @@ if TYPE_CHECKING: from vllm.multimodal import MultiModalKwargs from vllm.multimodal.inputs import PlaceholderRange - from vllm.v1.core.kv_cache_utils import BlockHashType class Request: @@ -63,11 +62,6 @@ def __init__( if self.mm_hashes: assert len(self.mm_inputs) == len(self.mm_hashes) - # Cache the computed kv block hashes of the request to avoid - # recomputing. - self._kv_block_hashes: List[BlockHashType] = [] - self.kv_block_hashes = ConstantList(self._kv_block_hashes) - # Read-only views # Prevent directly appending to the these lists since # they should also be updated simultaneously. @@ -124,13 +118,6 @@ def get_num_encoder_tokens(self, input_id: int) -> int: num_tokens = self.mm_positions[input_id]["length"] return num_tokens - def set_kv_block_hashes(self, value: List["BlockHashType"]) -> None: - self._kv_block_hashes = value - self.kv_block_hashes = ConstantList(self._kv_block_hashes) - - def append_kv_block_hashes(self, block_hash: "BlockHashType") -> None: - self._kv_block_hashes.append(block_hash) - class RequestStatus(enum.IntEnum): """Status of a request.""" From d92a1b958254da9915d8466adeaa1a72efd9b5c7 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 8 Feb 2025 13:02:53 +0800 Subject: [PATCH 0059/1240] [Bugfix] Fix Qwen2_5_VLForConditionalGeneration packed_modules_mapping (#12905) Signed-off-by: Louis Ulmer --- vllm/model_executor/models/qwen2_5_vl.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e93cf46b900..1f350ab203f 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -760,9 +760,12 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, "q_proj", "k_proj", "v_proj", - ] + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], } - # LoRA specific attributes, TODO: double check supported_lora_modules = [ "qkv_proj", From e0e57cea834a81e318525a456affb42ff2ebcecb Mon Sep 17 00:00:00 2001 From: Ke Zhao Date: Sat, 8 Feb 2025 14:56:43 +0800 Subject: [PATCH 0060/1240] [Misc] Fix typo in the example file (#12896) Signed-off-by: Zhao Ke Signed-off-by: Louis Ulmer --- .../openai_chat_embedding_client_for_multimodal.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index f49d7a22819..e410620378a 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -44,7 +44,7 @@ def vlm2vec(): def dse_qwen2_vl(inp: dict): # Embedding an Image - if inp["dtype"] == "image": + if inp["type"] == "image": messages = [{ 
"role": "user", @@ -113,10 +113,10 @@ def dse_qwen2_vl(inp: dict): vlm2vec() elif args.model == "dse_qwen2_vl": dse_qwen2_vl({ - "dtye": "image", + "type": "image", "image_url": image_url, }) dse_qwen2_vl({ - "dtype": "text", + "type": "text", "content": "What is the weather like today?", }) From 61525b4fbf55a6f575529aea05b5c64128e36a44 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Fri, 7 Feb 2025 23:04:34 -0800 Subject: [PATCH 0061/1240] [Bugfix] Fix multi-round chat error when mistral tokenizer is used (#12859) Signed-off-by: Zifei Tong Co-authored-by: Cyrus Leung Signed-off-by: Louis Ulmer --- vllm/transformers_utils/tokenizers/mistral.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 1550f978ed2..7a1dba42446 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -291,6 +291,16 @@ def apply_chat_template(self, from mistral_common.protocol.instruct.request import ( ChatCompletionRequest) + + # mistral-common requires AssistantMessage content to be string [1]. + # + # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80 + for message in messages: + if message.get("role") == "assistant": + content = message.get("content") + if isinstance(content, list): + content = "\n".join(chunk.get("text") for chunk in content) + message["content"] = content request = ChatCompletionRequest(messages=messages, tools=tools) # type: ignore[type-var] encoded = self.mistral.encode_chat_completion(request) From 066625063f34deea5fffabd2457c7574f1b94e23 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 8 Feb 2025 16:17:08 +0800 Subject: [PATCH 0062/1240] [bugfix] respect distributed_executor_backend in world_size=1 (#12934) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- ...st_custom_executor.py => test_executor.py} | 21 ++++++++- vllm/config.py | 3 ++ vllm/engine/llm_engine.py | 44 +++++++++---------- vllm/v1/executor/abstract.py | 17 ++++--- 4 files changed, 53 insertions(+), 32 deletions(-) rename tests/engine/{test_custom_executor.py => test_executor.py} (79%) diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_executor.py similarity index 79% rename from tests/engine/test_custom_executor.py rename to tests/engine/test_executor.py index 3e77faecbd3..84cc3ed63bb 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_executor.py @@ -55,6 +55,7 @@ def test_custom_executor(model, tmp_path): engine_args = EngineArgs( model=model, distributed_executor_backend=CustomUniExecutor, + enforce_eager=True, # reduce test time ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) @@ -75,7 +76,10 @@ def test_custom_executor_async(model, tmp_path): assert not os.path.exists(".marker") engine_args = AsyncEngineArgs( - model=model, distributed_executor_backend=CustomUniExecutorAsync) + model=model, + distributed_executor_backend=CustomUniExecutorAsync, + enforce_eager=True, # reduce test time + ) engine = AsyncLLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) @@ -89,3 +93,18 @@ async def t(): assert os.path.exists(".marker") finally: os.chdir(cwd) + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +def test_respect_ray(model): + # even for TP=1 and PP=1, + # if users specify ray, we should use ray. 
+ # users might do this if they want to manage the + # resources using ray. + engine_args = EngineArgs( + model=model, + distributed_executor_backend="ray", + enforce_eager=True, # reduce test time + ) + engine = LLMEngine.from_engine_args(engine_args) + assert engine.model_executor.uses_ray diff --git a/vllm/config.py b/vllm/config.py index 5579d6936d1..426ba380802 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1401,6 +1401,9 @@ def __post_init__(self) -> None: logger.info("Defaulting to use %s for distributed inference", backend) + if self.distributed_executor_backend is None and self.world_size == 1: + self.distributed_executor_backend = "uni" + self._verify_args() @property diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d82d9ad9df3..2e5bc75c6db 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -434,6 +434,7 @@ def _initialize_kv_caches(self) -> None: @classmethod def _get_executor_cls(cls, engine_config: VllmConfig) -> Type[ExecutorBase]: + # distributed_executor_backend must be set in VllmConfig.__post_init__ distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) # Initialize the cluster and specify the executor class. @@ -443,30 +444,29 @@ def _get_executor_cls(cls, "distributed_executor_backend must be a subclass of " f"ExecutorBase. Got {distributed_executor_backend}.") executor_class = distributed_executor_backend - elif engine_config.parallel_config.world_size > 1: - if distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( - RayDistributedExecutor) - executor_class = RayDistributedExecutor - elif distributed_executor_backend == "mp": - from vllm.executor.mp_distributed_executor import ( - MultiprocessingDistributedExecutor) - assert not envs.VLLM_USE_RAY_SPMD_WORKER, ( - "multiprocessing distributed executor backend does not " - "support VLLM_USE_RAY_SPMD_WORKER=1") - executor_class = MultiprocessingDistributedExecutor - elif distributed_executor_backend == "uni": - # JAX-style, single-process, multi-device executor. - from vllm.executor.uniproc_executor import UniProcExecutor - executor_class = UniProcExecutor - elif distributed_executor_backend == "external_launcher": - # executor with external launcher - from vllm.executor.uniproc_executor import ( # noqa - ExecutorWithExternalLauncher) - executor_class = ExecutorWithExternalLauncher - else: + elif distributed_executor_backend == "ray": + from vllm.executor.ray_distributed_executor import ( + RayDistributedExecutor) + executor_class = RayDistributedExecutor + elif distributed_executor_backend == "mp": + from vllm.executor.mp_distributed_executor import ( + MultiprocessingDistributedExecutor) + assert not envs.VLLM_USE_RAY_SPMD_WORKER, ( + "multiprocessing distributed executor backend does not " + "support VLLM_USE_RAY_SPMD_WORKER=1") + executor_class = MultiprocessingDistributedExecutor + elif distributed_executor_backend == "uni": + # JAX-style, single-process, multi-device executor. 
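            # Note: with the VllmConfig.__post_init__ fallback above, a
            # single-GPU engine that leaves the backend unset now defaults to
            # "uni" and takes this branch, while an explicit
            # distributed_executor_backend="ray" is still honored even when
            # world_size == 1 (exercised by test_respect_ray).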
from vllm.executor.uniproc_executor import UniProcExecutor executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # executor with external launcher + from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher) + executor_class = ExecutorWithExternalLauncher + else: + raise ValueError("unrecognized distributed_executor_backend: " + f"{distributed_executor_backend}") return executor_class @classmethod diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index ac10d43eb0d..093be09ae11 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -25,15 +25,14 @@ def get_class(vllm_config: VllmConfig) -> Type["Executor"]: parallel_config = vllm_config.parallel_config distributed_executor_backend = ( parallel_config.distributed_executor_backend) - if distributed_executor_backend is None: - # If the user does not specify the distributed executor backend, - # we will choose the backend based on the world size. - if parallel_config.world_size > 1: - distributed_executor_backend = "mp" - else: - distributed_executor_backend = "uni" - - if distributed_executor_backend == "ray": + # distributed_executor_backend must be set in VllmConfig.__post_init__ + if isinstance(distributed_executor_backend, type): + if not issubclass(distributed_executor_backend, ExecutorBase): + raise TypeError( + "distributed_executor_backend must be a subclass of " + f"ExecutorBase. Got {distributed_executor_backend}.") + executor_class = distributed_executor_backend + elif distributed_executor_backend == "ray": executor_class = RayDistributedExecutor elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor From 31ef1dd3f16b7958afa5b421733b65aa081d26ce Mon Sep 17 00:00:00 2001 From: Shaoting Date: Sat, 8 Feb 2025 02:38:20 -0600 Subject: [PATCH 0063/1240] [Misc] Add offline test for disaggregated prefill (#12418) Signed-off-by: Louis Ulmer --- .../disaggregated_prefill.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 examples/offline_inference/disaggregated_prefill.py diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py new file mode 100644 index 00000000000..2e41cabacca --- /dev/null +++ b/examples/offline_inference/disaggregated_prefill.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This file demonstrates the example usage of disaggregated prefilling +We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), +and then transfer the KV cache between them. +""" +import os +import time +from multiprocessing import Event, Process + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig + + +def run_prefill(prefill_done): + # We use GPU 0 for prefill node. + os.environ["CUDA_VISIBLE_DEVICES"] = "0" + + # The prefill node receives two requests, while the decode node receives + # three requests. So the decode node will only receive the KV Cache for + # requests 1 and 3. The decode node will use the KV Cache of requests 1 + # and 3 and do prefilling on request 2. + prompts = [ + "Hello, my name is", + # "Hi, your name is", + # The decode node will actually "prefill" this request. + "Tell me a very long story", + ] + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) + + # Using PyNcclConnector to transmit KV caches between vLLM instances. + # This instance is the prefill node (kv_producer, rank 0). 
+ # The number of parallel instances for KV cache transfer is set to 2, + # as required for PyNcclConnector. + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' + ) + + # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB + # memory. You may need to adjust the value to fit your GPU. + llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8) + + llm.generate(prompts, sampling_params) + print("Prefill node is finished.") + prefill_done.set() + + # To keep the prefill node running in case the decode node is not done; + # otherwise, the script might exit prematurely, causing incomplete decoding. + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + print("Script stopped by user.") + + +def run_decode(prefill_done): + # We use GPU 1 for decode node. + os.environ["CUDA_VISIBLE_DEVICES"] = "1" + + prompts = [ + "Hello, my name is", + "Hi, your name is", + "Tell me a very long story", + ] + sampling_params = SamplingParams(temperature=0, top_p=0.95) + + # Using PyNcclConnector to transmit KV caches between vLLM instances. + # This instance is the decode node (kv_consumer, rank 1). + # The number of parallel instances for KV cache transfer is set to 2, + # as required for PyNcclConnector. + ktc = KVTransferConfig.from_cli( + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' + ) + + # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB + # memory. You may need to adjust the value to fit your GPU. + llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct", + kv_transfer_config=ktc, + max_model_len=2000, + gpu_memory_utilization=0.8) + + # Wait for the producer to start the pipe + print("Waiting for prefill node to finish...") + prefill_done.wait() + + # At this point when the prefill_done is set, the kv-cache should have been + # transferred to this decode node, so we can start decoding. + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + prefill_done = Event() + prefill_process = Process(target=run_prefill, args=(prefill_done, )) + decode_process = Process(target=run_decode, args=(prefill_done, )) + + # Start prefill node + prefill_process.start() + + # Start decode node + decode_process.start() + + # Terminate the prefill node when decode is finished + decode_process.join() + prefill_process.terminate() From 25d8512cac71b9a62a9cc3415c01b196ff91b88b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 8 Feb 2025 00:39:09 -0800 Subject: [PATCH 0064/1240] [V1][Minor] Move cascade attn logic outside _prepare_inputs (#12943) Signed-off-by: Woosuk Kwon Signed-off-by: Louis Ulmer --- vllm/v1/worker/gpu_model_runner.py | 150 +++++++++++++++++------------ 1 file changed, 89 insertions(+), 61 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 561c3cf39e9..e0a096a9106 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -476,67 +476,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): self.device, non_blocking=True).long() # Prepare for cascade attention if needed. 
- common_prefix_len = (scheduler_output.num_common_prefix_blocks * - self.block_size) - if common_prefix_len == 0: - # Common case. - use_cascade = False - else: - # NOTE(woosuk): Cascade attention uses two attention kernels: one - # for the common prefix and the other for the rest. For the first - # kernel, we concatenate all the query tokens (possibly from - # different requests) and treat them as if they are from the same - # request. Then, we use bi-directional attention to process the - # common prefix in the KV cache. Importantly, this means that the - # first kernel does not do any masking. - - # Consider the following example: - # Request 1's input query: [D, E, X] - # Request 1's kv cache: [A, B, C, D, E, X] - # Request 1's num_computed_tokens: 3 (i.e., [A, B, C]) - # Request 2's input query: [E, Y] - # Request 2's kv cache: [A, B, C, D, E, Y] - # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D]) - - # If we use [A, B, C, D, E] as the common prefix, then the - # first kernel will compute the bi-directional attention between - # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E]. - # However, this is wrong because D in Request 1 should not attend to - # E in the common prefix (i.e., we need masking). - # To avoid this, [A, B, C, D] should be the common prefix. - # That is, the common prefix should be capped by the minimum - # num_computed_tokens among the requests, and plus one to include - # the first token of the query. - - # In practice, we use [A, B, C] as the common prefix, instead of - # [A, B, C, D] (i.e., the common prefix is capped by the minimum - # num_computed_tokens, without plus one). - # This is because of an implementation detail: We want to always - # use two kernels for cascade attention. Let's imagine: - # Request 3's input query: [D] - # Request 3's kv cache: [A, B, C, D] - # Request 3's num_computed_tokens: 4 (i.e., [A, B, C, D]) - # If we use [A, B, C, D] as the common prefix for Request 1-3, - # then Request 3 will be processed only by the first kernel, - # and the second kernel will get an empty input. While this is not - # a fundamental problem, our current implementation does not support - # this case. - common_prefix_len = min( - common_prefix_len, - self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) - # common_prefix_len should be a multiple of the block size. - common_prefix_len = (common_prefix_len // self.block_size * - self.block_size) - use_cascade = FlashAttentionBackend.use_cascade_attention( - common_prefix_len=common_prefix_len, - query_lens=num_scheduled_tokens, - num_query_heads=self.num_query_heads, - num_kv_heads=self.num_kv_heads, - use_alibi=False, # FIXME - use_sliding_window=self.sliding_window is not None, - num_sms=self.num_sms, - ) - + common_prefix_len = self._compute_cascade_attn_prefix_len( + num_scheduled_tokens, + scheduler_output.num_common_prefix_blocks, + ) + use_cascade = common_prefix_len > 0 if use_cascade: # TODO: Optimize. cu_prefix_query_lens = torch.tensor( @@ -581,6 +525,90 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices + def _compute_cascade_attn_prefix_len( + self, + num_scheduled_tokens: np.ndarray, + num_common_prefix_blocks: int, + ) -> int: + """Compute the length of the common prefix for cascade attention. 
+ + NOTE(woosuk): The common prefix length returned by this function + represents the length used specifically for cascade attention, not the + actual number of tokens shared between requests. When cascade attention + is disabled (use_cascade=False), this function returns 0 even if + requests share common tokens. Additionally, the common prefix length is + truncated to a multiple of the block size and may be further truncated + due to implementation details explained below. + + Args: + num_scheduled_tokens: Number of tokens scheduled per request. + num_common_prefix_blocks: Number of shared KV cache blocks. + + Returns: + int: Length of common prefix in tokens. + """ + common_prefix_len = num_common_prefix_blocks * self.block_size + if common_prefix_len == 0: + # Common case. + return 0 + + # NOTE(woosuk): Cascade attention uses two attention kernels: one + # for the common prefix and the other for the rest. For the first + # kernel, we concatenate all the query tokens (possibly from + # different requests) and treat them as if they are from the same + # request. Then, we use bi-directional attention to process the + # common prefix in the KV cache. Importantly, this means that the + # first kernel does not do any masking. + + # Consider the following example: + # Request 1's input query: [D, E, X] + # Request 1's kv cache: [A, B, C, D, E, X] + # Request 1's num_computed_tokens: 3 (i.e., [A, B, C]) + # Request 2's input query: [E, Y] + # Request 2's kv cache: [A, B, C, D, E, Y] + # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D]) + + # If we use [A, B, C, D, E] as the common prefix, then the + # first kernel will compute the bi-directional attention between + # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E]. + # However, this is wrong because D in Request 1 should not attend to + # E in the common prefix (i.e., we need masking). + # To avoid this, [A, B, C, D] should be the common prefix. + # That is, the common prefix should be capped by the minimum + # num_computed_tokens among the requests, and plus one to include + # the first token of the query. + + # In practice, we use [A, B, C] as the common prefix, instead of + # [A, B, C, D] (i.e., the common prefix is capped by the minimum + # num_computed_tokens, without plus one). + # This is because of an implementation detail: We want to always + # use two kernels for cascade attention. Let's imagine: + # Request 3's input query: [D] + # Request 3's kv cache: [A, B, C, D] + # Request 3's num_computed_tokens: 4 (i.e., [A, B, C, D]) + # If we use [A, B, C, D] as the common prefix for Request 1-3, + # then Request 3 will be processed only by the first kernel, + # and the second kernel will get an empty input. While this is not + # a fundamental problem, our current implementation does not support + # this case. + num_reqs = len(num_scheduled_tokens) + common_prefix_len = min( + common_prefix_len, + self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) + # common_prefix_len should be a multiple of the block size. 
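        # Worked example (numbers assumed purely for illustration): with
        # block_size = 16, num_common_prefix_blocks = 4 and per-request
        # num_computed_tokens = [35, 48, 52], the 4 * 16 = 64 shared tokens
        # are first capped to min(num_computed_tokens) = 35 and then rounded
        # down below to a block multiple, so common_prefix_len becomes 32
        # (and 0 is returned if use_cascade_attention rejects the shape).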
+ common_prefix_len = (common_prefix_len // self.block_size * + self.block_size) + use_cascade = FlashAttentionBackend.use_cascade_attention( + common_prefix_len=common_prefix_len, + query_lens=num_scheduled_tokens, + num_query_heads=self.num_query_heads, + num_kv_heads=self.num_kv_heads, + use_alibi=False, # FIXME + use_sliding_window=self.sliding_window is not None, + num_sms=self.num_sms, + ) + return common_prefix_len if use_cascade else 0 + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): mrope_pos_ptr = 0 num_reqs = self.input_batch.num_reqs From 4df6adca2ef10fd8b9e72d52094483db63707951 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Sat, 8 Feb 2025 17:15:15 +0800 Subject: [PATCH 0065/1240] [Build] Make pypi install work on CPU platform (#12874) Signed-off-by: Louis Ulmer --- setup.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a4043c43a7d..dc517dafa31 100755 --- a/setup.py +++ b/setup.py @@ -47,6 +47,11 @@ def load_module_from_path(module_name, path): "Building on %s, " "so vLLM may not be able to run correctly", sys.platform) VLLM_TARGET_DEVICE = "empty" +elif (sys.platform.startswith("linux") and torch.version.cuda is None + and os.getenv("VLLM_TARGET_DEVICE") is None): + # if cuda is not available and VLLM_TARGET_DEVICE is not set, + # fallback to cpu + VLLM_TARGET_DEVICE = "cpu" MAIN_CUDA_VERSION = "12.1" @@ -482,7 +487,6 @@ def get_vllm_version() -> str: version = get_version( write_to="vllm/_version.py", # TODO: move this to pyproject.toml ) - sep = "+" if "+" not in version else "." # dev versions might contain + if _no_device(): @@ -520,7 +524,8 @@ def get_vllm_version() -> str: elif _is_tpu(): version += f"{sep}tpu" elif _is_cpu(): - version += f"{sep}cpu" + if envs.VLLM_TARGET_DEVICE == "cpu": + version += f"{sep}cpu" elif _is_xpu(): version += f"{sep}xpu" else: From 79eb3fd258fd421b496ea3f76659b972aa82d997 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Sat, 8 Feb 2025 14:45:30 +0530 Subject: [PATCH 0066/1240] [Hardware][Intel-Gaudi] Enable long-contexts + LoRA support for Intel Gaudi (#12812) Signed-off-by: Sanju C Sudhakaran Signed-off-by: Louis Ulmer --- vllm/lora/punica_wrapper/punica_hpu.py | 57 ++++++++++++++++++- .../model_executor/layers/rotary_embedding.py | 3 +- vllm/worker/hpu_model_runner.py | 17 +++++- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index 51e1bfab3f5..3661a721464 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,12 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union, final +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final import torch from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, dispatch_bgmv_linear) from .punica_base import PunicaWrapperBase +from .utils import convert_mapping + +if TYPE_CHECKING: + # avoid circuit import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext @final @@ -19,6 +25,55 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens, max_batches, device) + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + ( + base_indices, 
+ sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size, + extra_vocab_size, self.device, None) + # Updating each element in `long_lora_offsets` with `lora_offset` slows + # down perf in HPU due to a series of `strided_insert` ops during lazy + # graph accumulation. Hence HPU appends `lora_offset` to a list and + # converts it to a tensor only after it is ready. + if long_lora_context: + index_mapping_indices: List[int] = list( + mapping.index_mapping).copy() + long_lora_offsets: List[int] = [] + for i in range(len(index_mapping_indices)): + lora_offset: int = long_lora_context.offsets_by_lora_id.get( + index_mapping_indices[i], 0) + long_lora_offsets.append(lora_offset) + long_lora_offsets_tensor = torch.tensor(long_lora_offsets, + device=self.device, + dtype=torch.long) + indices_len[-1] = long_lora_offsets_tensor.shape[-1] + + self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) + self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) + self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( + sampler_indices_padded) + self._embeddings_indices[:embeddings_indices. + shape[0], :embeddings_indices.shape[1]].copy_( + embeddings_indices) + if long_lora_offsets_tensor is not None: + self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( + long_lora_offsets_tensor) + else: + self._long_lora_indices.zero_() + self.indices_len[:] = indices_len + def add_lora_embedding(self, y: torch.Tensor, x: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index ec204b32f67..5d7f9396c20 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -206,9 +206,10 @@ def forward_hpu( ) -> Tuple[torch.Tensor, torch.Tensor]: from habana_frameworks.torch.hpex.kernels import ( RotaryPosEmbeddingMode, apply_rotary_pos_emb) - positions = positions.flatten() if offsets is not None: + offsets = offsets.view(positions.shape[0], -1) positions = positions + offsets + positions = positions.flatten() num_tokens = positions.shape[0] cos_sin = self.cos_sin_cache.index_select(0, positions).view( num_tokens, 1, -1) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b846d4387ba..774049a5281 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -639,12 +639,25 @@ def load_model(self) -> None: "Bias support in LoRA is not enabled in HPU yet." assert not self.lora_config.fully_sharded_loras, \ "Fully sharded LoRAs is not enabled in HPU yet." + # It's necessary to distinguish between the + # max_position_embeddings of VLMs and LLMs. 
+ if hasattr(self.model.config, "max_position_embeddings"): + max_pos_embeddings = ( + self.model.config.max_position_embeddings) + else: + max_pos_embeddings = ( + self.model.config.text_config.max_position_embeddings) + self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, - self.vocab_size, self.lora_config, self.device, + self.vocab_size, + self.lora_config, + self.device, self.model.embedding_modules, - self.model.embedding_padding_modules) + self.model.embedding_padding_modules, + max_position_embeddings=max_pos_embeddings, + ) self.model = self.lora_manager.create_lora_manager(self.model) if self.model_config.quantization == 'inc': From 1544cbb875265f553b9d5bde3ca1edbcc1e47d0f Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 8 Feb 2025 14:45:44 +0530 Subject: [PATCH 0067/1240] [misc] Add LoRA to benchmark_serving (#12898) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath Signed-off-by: Louis Ulmer --- benchmarks/benchmark_serving.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e934d228f7f..1044bef5941 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -537,6 +537,7 @@ async def benchmark( ignore_eos: bool, goodput_config_dict: Dict[str, float], max_concurrency: Optional[int], + lora_modules: Optional[List[str]], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -562,6 +563,7 @@ async def benchmark( multi_modal_content=test_mm_content, ignore_eos=ignore_eos, ) + test_output = await request_func(request_func_input=test_input) if not test_output.success: raise ValueError( @@ -570,6 +572,11 @@ async def benchmark( else: print("Initial test run completed. Starting main benchmark run...") + if lora_modules: + # For each input request, choose a LoRA module at random. + lora_modules = iter( + [random.choice(lora_modules) for _ in range(len(input_requests))]) + if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, @@ -616,8 +623,13 @@ async def limited_request_func(request_func_input, pbar): tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request - request_func_input = RequestFuncInput(model=model_id, - model_name=model_name, + req_model_id, req_model_name = model_id, model_name + if lora_modules: + req_lora_module = next(lora_modules) + req_model_id, req_model_name = req_lora_module, req_lora_module + + request_func_input = RequestFuncInput(model=req_model_id, + model_name=req_model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -900,6 +912,7 @@ def main(args: argparse.Namespace): ignore_eos=args.ignore_eos, goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, + lora_modules=args.lora_modules, )) # Save config and results to json @@ -1237,5 +1250,12 @@ def main(args: argparse.Namespace): "If not specified, the model name will be the " "same as the ``--model`` argument. ") + parser.add_argument("--lora-modules", + nargs='+', + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. 
For each request, the " + "script chooses a LoRA module at random.") + args = parser.parse_args() main(args) From 064606f2296544dd450068ca4e9180634682e9f8 Mon Sep 17 00:00:00 2001 From: Jun Duan Date: Sat, 8 Feb 2025 04:16:42 -0500 Subject: [PATCH 0068/1240] [Misc] Log time consumption on weight downloading (#12926) Signed-off-by: Louis Ulmer --- vllm/model_executor/model_loader/weight_utils.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index cade0a1dd59..68ade319df2 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -6,6 +6,7 @@ import json import os import tempfile +import time from collections import defaultdict from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union @@ -14,7 +15,8 @@ import huggingface_hub.constants import numpy as np import torch -from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download +from huggingface_hub import (HfFileSystem, hf_hub_download, scan_cache_dir, + snapshot_download) from safetensors.torch import load_file, safe_open, save_file from tqdm.auto import tqdm @@ -253,6 +255,8 @@ def download_weights_from_hf( # Use file lock to prevent multiple processes from # downloading the same model weights at the same time. with get_lock(model_name_or_path, cache_dir): + start_size = scan_cache_dir().size_on_disk + start_time = time.perf_counter() hf_folder = snapshot_download( model_name_or_path, allow_patterns=allow_patterns, @@ -262,6 +266,11 @@ def download_weights_from_hf( revision=revision, local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, ) + end_time = time.perf_counter() + end_size = scan_cache_dir().size_on_disk + if end_size != start_size: + logger.info("Time took to download weights for %s: %.6f seconds", + model_name_or_path, end_time - start_time) return hf_folder From 4bec03c38e976e27e6ba7664885ba6a71297a135 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Sat, 8 Feb 2025 01:41:35 -0800 Subject: [PATCH 0069/1240] [CI] Resolve transformers-neuronx version conflict (#12925) Signed-off-by: Louis Ulmer --- .buildkite/run-neuron-test.sh | 3 --- Dockerfile.neuron | 8 +++++++- requirements-neuron.txt | 1 - setup.py | 7 +------ 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 1ad77cf50f6..55c374fcc33 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -29,9 +29,6 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then docker image prune -f # Remove unused volumes / force the system prune for old images as well. 
docker volume prune -f && docker system prune -f - # Remove huggingface model artifacts and compiler cache - rm -rf "${HF_MOUNT:?}/*" - rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else diff --git a/Dockerfile.neuron b/Dockerfile.neuron index e9cb82889de..27658d836d9 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -23,10 +23,12 @@ WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas RUN python3 -m pip install sentencepiece transformers==4.45.2 -U -RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install pytest +# uninstall transformers-neuronx package explicitly to avoid version conflict +RUN python3 -m pip uninstall -y transformers-neuronx + COPY . . ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ @@ -43,6 +45,10 @@ RUN --mount=type=bind,source=.git,target=.git \ # install development dependencies (for testing) RUN python3 -m pip install -e tests/vllm_test_utils +# install transformers-neuronx package as an optional dependencies (for V0) +# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict +RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps + # overwrite entrypoint to run bash script RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 5e08d101fcd..09820c73e4e 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -2,6 +2,5 @@ -r requirements-common.txt # Dependencies for Neuron devices -transformers-neuronx >= 0.13.0 torch-neuronx >= 2.5.0 neuronx-cc diff --git a/setup.py b/setup.py index dc517dafa31..3e2adadf670 100755 --- a/setup.py +++ b/setup.py @@ -374,12 +374,7 @@ def _is_hip() -> bool: def _is_neuron() -> bool: - torch_neuronx_installed = True - try: - subprocess.run(["neuron-ls"], capture_output=True, check=True) - except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - torch_neuronx_installed = False - return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron" + return VLLM_TARGET_DEVICE == "neuron" def _is_tpu() -> bool: From 05539265fc4f39232cf48a6c2d4a6da338a2e25e Mon Sep 17 00:00:00 2001 From: Jun Duan Date: Sat, 8 Feb 2025 04:42:15 -0500 Subject: [PATCH 0070/1240] [Doc] Correct HF repository for TeleChat2 models (#12949) Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 32f3e9deff6..38f36b54d89 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -429,7 +429,7 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ - * `TeleChat2ForCausalLM` * TeleChat2 - * `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. + * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. 
* ✅︎ * ✅︎ - * `XverseForCausalLM` From 254fa8c511b505c4475ead3f5387e765db2be931 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 8 Feb 2025 20:24:47 +0800 Subject: [PATCH 0071/1240] [Misc] Add qwen2.5-vl BNB support (#12944) Signed-off-by: Louis Ulmer --- vllm/model_executor/models/qwen2_5_vl.py | 59 ++++++++++++------------ 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 1f350ab203f..d4c48dbdab1 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -40,7 +40,7 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.distributed import parallel_state +from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata @@ -207,11 +207,12 @@ def __init__( ) -> None: super().__init__() # Per attention head and per partition values. - world_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( - num_heads, world_size) + num_heads, self.tp_size) self.qkv = ColumnParallelLinear(input_size=embed_dim, output_size=3 * projection_size, @@ -231,6 +232,29 @@ def __init__( f"Qwen2.5-VL does not support {self.attn_backend} backend now." ) + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = tensor_model_parallel_all_gather(qkv) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + def forward( self, x: torch.Tensor, @@ -240,15 +264,8 @@ def forward( # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) - # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] - new_x_shape = x.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - x = x.view(*new_x_shape) - - # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] - q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() @@ -665,24 +682,6 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight, shard_id) break else: - if name.endswith("qkv.weight"): - visual_num_heads = self.num_heads - visual_embed_dim = self.hidden_size - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size, - visual_embed_dim) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) - elif name.endswith("qkv.bias"): - visual_num_heads = self.num_heads - visual_embed_dim = self.hidden_size - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1) - param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) From 0f837b965817dee46913f2806abea7720c975a51 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 8 Feb 2025 20:25:15 +0800 Subject: [PATCH 0072/1240] [CI/Build] Auto-fix Markdown files (#12941) Signed-off-by: Louis Ulmer --- .buildkite/nightly-benchmarks/README.md | 46 ++++++--------- .../nightly-benchmarks/nightly-annotation.md | 21 ++++--- .../nightly-descriptions.md | 6 +- .../performance-benchmarks-descriptions.md | 10 +--- .github/PULL_REQUEST_TEMPLATE.md | 3 +- .pre-commit-config.yaml | 2 +- CODE_OF_CONDUCT.md | 1 - README.md | 14 +++-- benchmarks/README.md | 2 + csrc/quantization/cutlass_w8a8/Epilogues.md | 44 ++++++++++---- csrc/quantization/machete/Readme.md | 14 ++--- .../installation/gpu/rocm.inc.md | 9 ++- docs/source/serving/engine_args.md | 4 +- .../offline_inference/openai/openai_batch.md | 59 +++++++++---------- .../offline_inference/profiling_tpu/README.md | 6 +- examples/online_serving/chart-helm/README.md | 2 +- examples/online_serving/opentelemetry/Otel.md | 32 ++++++---- .../prometheus_grafana/README.md | 14 +++-- examples/other/logging_configuration.md | 5 -- vllm/distributed/kv_transfer/README.md | 5 +- 20 files changed, 158 insertions(+), 141 deletions(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index fbf41eb10a3..d3f5fc5cd4c 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -1,15 +1,13 @@ # vLLM benchmark suite - ## Introduction This directory contains two sets of benchmark for vllm. + - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. - -See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. - +See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. ## Performance benchmark quick overview @@ -19,17 +17,14 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run. 
- ## Nightly benchmark quick overview -**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. +**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B. **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy. **Benchmarking Duration**: about 3.5hrs. - - ## Trigger the benchmark Performance benchmark will be triggered when: @@ -39,16 +34,11 @@ Performance benchmark will be triggered when: Nightly benchmark will be triggered when: - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. - - - ## Performance benchmark details - See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. - -#### Latency test +### Latency test Here is an example of one test inside `latency-tests.json`: @@ -68,23 +58,25 @@ Here is an example of one test inside `latency-tests.json`: ``` In this example: -- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. -- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` + +- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. +- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. +### Throughput test -#### Throughput test The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. -#### Serving test +### Serving test + We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. 
The corresponding parameters are in `serving-tests.json`, and here is an example: -``` +```json [ { "test_name": "serving_llama8B_tp1_sharegpt", @@ -109,6 +101,7 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t ``` Inside this example: + - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. - The `server-parameters` includes the command line arguments for vLLM server. - The `client-parameters` includes the command line arguments for `benchmark_serving.py`. @@ -118,36 +111,33 @@ The number of this test is less stable compared to the delay and latency benchma WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. -#### Visualizing the results +### Visualizing the results + The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. - - ## Nightly test details See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. +### Workflow -#### Workflow - -- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. +- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines. - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. - At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. -#### Nightly tests +### Nightly tests In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. -#### Docker containers +### Docker containers The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). 
- diff --git a/.buildkite/nightly-benchmarks/nightly-annotation.md b/.buildkite/nightly-benchmarks/nightly-annotation.md index 1e33793842b..e43ea765f15 100644 --- a/.buildkite/nightly-benchmarks/nightly-annotation.md +++ b/.buildkite/nightly-benchmarks/nightly-annotation.md @@ -9,20 +9,19 @@ This file contains the downloading link for benchmarking results. Please download the visualization scripts in the post - ## Results reproduction - Find the docker we use in `benchmarking pipeline` - Deploy the docker, and inside the docker: - - Download `nightly-benchmarks.zip`. - - In the same folder, run the following code -``` -export HF_TOKEN= -apt update -apt install -y git -unzip nightly-benchmarks.zip -VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh -``` + - Download `nightly-benchmarks.zip`. + - In the same folder, run the following code: -And the results will be inside `./benchmarks/results`. + ```console + export HF_TOKEN= + apt update + apt install -y git + unzip nightly-benchmarks.zip + VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh + ``` +And the results will be inside `./benchmarks/results`. diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 7dec7a0fe0b..5f003f42f07 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -2,6 +2,7 @@ # Nightly benchmark This benchmark aims to: + - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload. - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions. @@ -9,7 +10,6 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html) Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176) - ## Setup - Docker images: @@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/ - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). -# Known issues +## Known issues - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). -- TGI does not support `ignore-eos` flag. \ No newline at end of file +- TGI does not support `ignore-eos` flag. diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md index da32d1f073c..cacaef986c6 100644 --- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md +++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md @@ -7,10 +7,8 @@ - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: end-to-end latency (mean, median, p99). - {latency_tests_markdown_table} - ## Throughput tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). @@ -19,10 +17,8 @@ - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: throughput. - {throughput_tests_markdown_table} - ## Serving tests - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). 
@@ -33,13 +29,11 @@ - We also added a speculative decoding test for llama-3 70B, under QPS 2 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). - {serving_tests_markdown_table} - ## json version of the benchmarking tables -This section contains the data of the markdown tables above in JSON format. +This section contains the data of the markdown tables above in JSON format. You can load the benchmarking tables into pandas dataframes as follows: ```python @@ -54,9 +48,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) ``` The json string for all benchmarking tables: + ```json {benchmarking_results_in_json_string} ``` You can also check the raw experiment data in the Artifact tab of the Buildkite page. - diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 51a73c857cc..a20c5baf895 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,4 +2,5 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** + +**BEFORE SUBMITTING, PLEASE READ ** diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fb74ab9b23..118451593d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: rev: v0.9.27 hooks: - id: pymarkdown - files: docs/.* + args: [fix] - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 1a9596841cc..5268ff135c9 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by For answers to common questions about this code of conduct, see the [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at [Contributor Covenant translations](https://www.contributor-covenant.org/translations). - diff --git a/README.md b/README.md index cd0b1c517fd..f04acf09cff 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 + - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). - [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! @@ -33,7 +34,9 @@ Easy, fast, and cheap LLM serving for everyone - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). --- + ## About + vLLM is a fast and easy-to-use library for LLM inference and serving. 
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry. @@ -127,6 +130,7 @@ We also have an official fundraising venue through [OpenCollective](https://open ## Citation If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): + ```bibtex @inproceedings{kwon2023efficient, title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, @@ -138,11 +142,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Contact Us -* For technical questions and feature requests, please use Github issues or discussions. -* For discussing with fellow users and coordinating contributions and development, please use Slack. -* For security disclosures, please use Github's security advisory feature. -* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. +- For technical questions and feature requests, please use Github issues or discussions. +- For discussing with fellow users and coordinating contributions and development, please use Slack. +- For security disclosures, please use Github's security advisory feature. +- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu. ## Media Kit -* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). diff --git a/benchmarks/README.md b/benchmarks/README.md index 2aa4a285021..890a2525bcf 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -3,6 +3,7 @@ ## Downloading the ShareGPT dataset You can download the dataset by running: + ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` @@ -11,6 +12,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts will ignore a datapoint if the referred image is missing. + ```bash wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json mkdir coco -p diff --git a/csrc/quantization/cutlass_w8a8/Epilogues.md b/csrc/quantization/cutlass_w8a8/Epilogues.md index aae04157b10..a30e1fdf3ac 100644 --- a/csrc/quantization/cutlass_w8a8/Epilogues.md +++ b/csrc/quantization/cutlass_w8a8/Epilogues.md @@ -1,17 +1,19 @@ # CUTLASS Epilogues ## Introduction -This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. + +This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. Currently, we only support symmetric quantization for weights, and symmetric and asymmetric quantization for activations. Both can be quantized per-tensor or per-channel (weights) / per-token (activations). There are 4 epilogues: -1. ScaledEpilogue: symmetric quantization for activations, no bias. -1. ScaledEpilogueBias: symmetric quantization for activations, supports bias. -1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias. -1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias. + +1. 
`ScaledEpilogue`: symmetric quantization for activations, no bias. +1. `ScaledEpilogueBias`: symmetric quantization for activations, supports bias. +1. `ScaledEpilogueAzp`: asymmetric per-tensor quantization for activations, supports bias. +1. `ScaledEpilogueAzpPerToken`: asymmetric per-token quantization for activations, supports bias. We do not have epilogues for asymmetric quantization of activations without bias in order to reduce final binary size. Instead, if no bias is passed, the epilogue will use 0 as the bias. @@ -26,12 +28,15 @@ If $` \widehat X `$ is the quantized $` X `$, our matrices become the following ```math A = s_a (\widehat A - J_a z_a) ``` + ```math B = s_b \widehat B ``` + ```math D = A B + C ``` + ```math D = s_a s_b \widehat D + C ``` @@ -48,9 +53,11 @@ Expanding further, we can calculate $` \widehat D `$ as follows: ```math A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B ``` + ```math A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right) ``` + ```math \widehat D = \widehat A \widehat B - z_a J_a \widehat B ``` @@ -61,16 +68,19 @@ Each row of it is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of ## Epilogues -### ScaledEpilogue +### `ScaledEpilogue` + This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$. The output of the GEMM is: ```math \widehat D = \widehat A \widehat B ``` + ```math D = s_a s_b \widehat D ``` + ```math D = s_a s_b \widehat A \widehat B ``` @@ -79,44 +89,51 @@ Epilogue parameters: - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). -### ScaledEpilogueBias +### `ScaledEpilogueBias` + This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$. The output of the GEMM is: ```math \widehat D = \widehat A \widehat B ``` + ```math D = s_a s_b \widehat D + C ``` + ```math D = s_a s_b \widehat A \widehat B + C ``` - Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). - `bias` is the bias, is always per-channel (row-vector). -### ScaledEpilogueAzp +### `ScaledEpilogueAzp` + This epilogue computes the asymmetric per-tensor quantization for activations with bias. The output of the GEMM is: ```math \widehat D = \widehat A \widehat B - z_a J_a \widehat B ``` + ```math D = s_a s_b \widehat D + C ``` + ```math D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C ``` -Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$. +Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 B `$. That is precomputed and stored in `azp_with_adj` as a row-vector. Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - Generally this will be per-tensor as the zero-points are per-tensor. - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). @@ -125,13 +142,15 @@ Epilogue parameters: To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel. 
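As a minimal sketch of that offline step (the helper name and shapes here are assumptions for illustration, not the exact vLLM API): because $` \mathbf 1 \widehat B `$ is simply the column-wise sum of the quantized weight, the `azp_with_adj` row-vector can be built directly from $` \widehat B `$ and the per-tensor zero point:

```python
import torch

def precompute_azp_with_adj(w_q: torch.Tensor, azp: int) -> torch.Tensor:
    """Hypothetical offline helper. w_q is the quantized weight of shape
    [K, N] (e.g. int8); azp is the per-tensor activation zero point z_a."""
    # 1^T @ w_q: column-wise sums of the quantized weight, one value per
    # output channel, accumulated in int32 to avoid int8 overflow.
    ones_B = w_q.to(torch.int32).sum(dim=0, keepdim=True)  # shape [1, N]
    # Fold the scalar zero point in offline: z_a * (1 B).
    return azp * ones_B
```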
-### ScaledEpilogueAzpPerToken +### `ScaledEpilogueAzpPerToken` + This epilogue computes the asymmetric per-token quantization for activations with bias. The output of the GEMM is the same as above, but the $` z_a `$ is a column-vector. That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$. Epilogue parameters: + - `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector). - Generally this will be per-token as the zero-points are per-token. - `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector). @@ -142,6 +161,7 @@ Epilogue parameters: To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel. The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM): -``` + +```math out = scale_a * scale_b * (Dq - azp_adj * azp) + bias ``` diff --git a/csrc/quantization/machete/Readme.md b/csrc/quantization/machete/Readme.md index 9ddf8da993b..6ffb2416b73 100644 --- a/csrc/quantization/machete/Readme.md +++ b/csrc/quantization/machete/Readme.md @@ -6,25 +6,25 @@ Machete is a spiritual successor to the Marlin kernel but optimized for Hopper a Machete effectively performs -``` +```python scale_type = w_s.dtype compute_type = a.dtype out = (w_q.to(scale_type) * w_s - w_z.to(scale_type)) @ a ``` -Where `w_q` is a quantized weight matrix, `w_s` is the quantization scales, and +Where `w_q` is a quantized weight matrix, `w_s` is the quantization scales, and `w_z` is the quantization zeropoints. -> **_NOTE:_** `w_z` is added after the scales so we can +> **_NOTE:_** `w_z` is added after the scales so we can use FMA operations, but this means they must have the scales pre-applied if the -supplied zeropoints assume that they will be subtracted before the scales are +supplied zeropoints assume that they will be subtracted before the scales are applied. ## API The main optimization within Machete is prepacking the weight matrix to more closely match the tensor core layouts, allowing for wider shared memory loads when loading the weight matrix. This means that the weight matrix must be prepacked before calling `machete_gemm`. The flow looks something like: -``` +```python from vllm import _custom_ops as ops ... @@ -40,6 +40,6 @@ output = ops.machete_gemm( ## Code Generation -Since Machete is based on Cutlass, we can generate multiple type pairs and different tile shapes using the same kernel template. We generate multiple instantiations of this template using `generate.py`. +Since Machete is based on Cutlass, we can generate multiple type pairs and different tile shapes using the same kernel template. We generate multiple instantiations of this template using `generate.py`. -New type pairs (`TypeConfig`s) can be appended to `impl_configs` (in `generate()`), and these will get automatically generated (assuming they can be supported without issues). For each `TypeConfig`, you must also provide an `ImplConfig`, which bundles a `TypeConfig` with a list of `ScheduleConfig`s, `Specialization`s, and a default heuristic. The `ScheduleConfig`s (which contain info on tile shapes, tile scheduler, etc.) can perform differently for different problem shapes, and there is almost never one `ScheduleConfig` that works well for all problem shapes, so it is generally beneficial to generate different `ScheduleConfig`s for different potential problem shapes. This is where the heuristic comes in. 
For each `TypeConfig`, a default heuristic should be provided. This maps different problem shapes to different `ScheduleConfig`s and is used when the user does not provide the `schedule` parameter to `machete_gemm`. The `Specialization`s define what feature combinations to generate, i.e., `with_zeropoints`, `with_scales`, etc. We can reduce compile times and the final binary size by limiting the set of feature combinations we generate. \ No newline at end of file +New type pairs (`TypeConfig`s) can be appended to `impl_configs` (in `generate()`), and these will get automatically generated (assuming they can be supported without issues). For each `TypeConfig`, you must also provide an `ImplConfig`, which bundles a `TypeConfig` with a list of `ScheduleConfig`s, `Specialization`s, and a default heuristic. The `ScheduleConfig`s (which contain info on tile shapes, tile scheduler, etc.) can perform differently for different problem shapes, and there is almost never one `ScheduleConfig` that works well for all problem shapes, so it is generally beneficial to generate different `ScheduleConfig`s for different potential problem shapes. This is where the heuristic comes in. For each `TypeConfig`, a default heuristic should be provided. This maps different problem shapes to different `ScheduleConfig`s and is used when the user does not provide the `schedule` parameter to `machete_gemm`. The `Specialization`s define what feature combinations to generate, i.e., `with_zeropoints`, `with_scales`, etc. We can reduce compile times and the final binary size by limiting the set of feature combinations we generate. diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index 336d578de40..7004313c90f 100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -93,12 +93,11 @@ Currently, there are no pre-built ROCm wheels. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - :::{tip} - - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. + - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. ::: :::{tip} diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md index 827c25b5052..f4587b94ede 100644 --- a/docs/source/serving/engine_args.md +++ b/docs/source/serving/engine_args.md @@ -4,7 +4,7 @@ Below, you can find an explanation of every engine argument for vLLM: - + ```{eval-rst} .. 
argparse:: :module: vllm.engine.arg_utils @@ -17,7 +17,7 @@ Below, you can find an explanation of every engine argument for vLLM: Below are the additional arguments related to the asynchronous engine: - + ```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai/openai_batch.md index 953e6ef130f..d271573aa96 100644 --- a/examples/offline_inference/openai/openai_batch.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -5,50 +5,49 @@ This is a guide to performing batch inference using the OpenAI batch file format ``` ## File Format - + The OpenAI batch file format consists of a series of json objects on new lines. - + [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) - + Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. - + ```{note} We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon). ``` - + ## Pre-requisites * The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`. - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens) - Install the token on your machine (Run `huggingface-cli login`). - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. - - + ## Example 1: Running with a local file ### Step 1: Create your batch file To follow along with this example, you can download the example batch, or create your own batch file in your working directory. -``` +```console wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this -``` +```console $ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` ### Step 2: Run the batch - + The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` -``` +```console python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -56,7 +55,7 @@ python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_e You should now have your results at `results.jsonl`. 
You can check your results by running `cat results.jsonl` -``` +```console $ cat results.jsonl {"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null} {"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null} @@ -68,7 +67,7 @@ The batch runner supports remote input and output urls that are accessible via h For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run -``` +```console python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` @@ -80,7 +79,7 @@ To integrate with cloud blob storage, we recommend using presigned urls. ### Additional prerequisites -* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html). +* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html). * The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3. - [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html). * The `boto3` python package (Run `pip install boto3`) to generate presigned urls. @@ -89,13 +88,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. To follow along with this example, you can download the example batch, or create your own batch file in your working directory. -``` +```console wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this -``` +```console $ cat offline_inference/openai/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} @@ -103,7 +102,7 @@ $ cat offline_inference/openai/openai_example_batch.jsonl Now upload your batch file to your S3 bucket. 
-``` +```console aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` @@ -111,9 +110,9 @@ aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_ Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names. -(The script is adapted from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py) +(The script is adapted from ) -``` +```python import boto3 from botocore.exceptions import ClientError @@ -149,7 +148,7 @@ print(f"{output_url=}") This script should output -``` +```text input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091' output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091' ``` @@ -158,7 +157,7 @@ output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AW You can now run the batch runner, using the urls generated in the previous section. -``` +```console python -m vllm.entrypoints.openai.run_batch \ -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ @@ -169,7 +168,7 @@ python -m vllm.entrypoints.openai.run_batch \ Your results are now on S3. You can view them in your terminal by running -``` +```console aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - ``` @@ -180,10 +179,10 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - * Ensure you are using `vllm >= 0.5.5`. ### Step 1: Create your batch file - + Add embedding requests to your batch file. The following is an example: - -``` + +```text {"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}} {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}} ``` @@ -198,7 +197,7 @@ You can run the batch using the same command as in earlier examples. You can check your results by running `cat results.jsonl` -``` +```console $ cat results.jsonl {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} ... @@ -211,10 +210,10 @@ $ cat results.jsonl * Ensure you are using `vllm >= 0.7.0`. ### Step 1: Create your batch file - + Add score requests to your batch file. 
The following is an example: - -``` + +```text {"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} ``` @@ -229,7 +228,7 @@ You can run the batch using the same command as in earlier examples. You can check your results by running `cat results.jsonl` -``` +```console $ cat results.jsonl {"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} {"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md index 08efa63dc10..6595efec437 100644 --- a/examples/offline_inference/profiling_tpu/README.md +++ b/examples/offline_inference/profiling_tpu/README.md @@ -29,7 +29,6 @@ python3 profiling.py \ --profile-result-dir profiles ``` - ### Generate Decode Trace This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). @@ -51,17 +50,18 @@ python3 profiling.py \ --max-model-len 2048 --tensor-parallel-size 8 ``` - ## Visualizing the profiles Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). Here are most likely the dependencies you need to install: + ```bash pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources ``` Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: + ```bash tensorboard --logdir profiles/ --port 6006 -``` \ No newline at end of file +``` diff --git a/examples/online_serving/chart-helm/README.md b/examples/online_serving/chart-helm/README.md index 6aa126d4fd2..bfe81121d1f 100644 --- a/examples/online_serving/chart-helm/README.md +++ b/examples/online_serving/chart-helm/README.md @@ -18,4 +18,4 @@ This directory contains a Helm chart for deploying the vllm application. The cha - templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget. 
- templates/pvc.yaml: Template for Persistent Volume Claims. - templates/secrets.yaml: Template for Kubernetes Secrets. -- templates/service.yaml: Template for creating Services. \ No newline at end of file +- templates/service.yaml: Template for creating Services. diff --git a/examples/online_serving/opentelemetry/Otel.md b/examples/online_serving/opentelemetry/Otel.md index 96d1f96bfa1..af003400797 100644 --- a/examples/online_serving/opentelemetry/Otel.md +++ b/examples/online_serving/opentelemetry/Otel.md @@ -1,7 +1,8 @@ # Setup OpenTelemetry POC 1. Install OpenTelemetry packages: - ``` + + ```console pip install \ 'opentelemetry-sdk>=1.26.0,<1.27.0' \ 'opentelemetry-api>=1.26.0,<1.27.0' \ @@ -10,7 +11,8 @@ ``` 1. Start Jaeger in a docker container: - ``` + + ```console # From: https://www.jaegertracing.io/docs/1.57/getting-started/ docker run --rm --name jaeger \ -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \ @@ -28,19 +30,23 @@ ``` 1. In a new shell, export Jaeger IP: - ``` + + ```console export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 ``` + Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM: - ``` + + ```console export OTEL_SERVICE_NAME="vllm-server" export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` 1. In a new shell, send requests with trace context from a dummy client - ``` + + ```console export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true @@ -48,7 +54,7 @@ python dummy_client.py ``` -1. Open Jaeger webui: http://localhost:16686/ +1. Open Jaeger webui: In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request. ![Traces](https://i.imgur.com/GYHhFjo.png) @@ -57,26 +63,32 @@ ![Spans details](https://i.imgur.com/OPf6CBL.png) ## Exporter Protocol + OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter. By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows: -``` + +```console export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" ``` ## Instrumentation of FastAPI + OpenTelemetry allows automatic instrumentation of FastAPI. + 1. Install the instrumentation library - ``` + + ```console pip install opentelemetry-instrumentation-fastapi ``` 1. Run vLLM with `opentelemetry-instrument` - ``` + + ```console opentelemetry-instrument vllm serve facebook/opt-125m ``` 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. 
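To exercise the instrumented server, any OpenAI-compatible request is enough. The snippet below is a minimal sketch, assuming the `facebook/opt-125m` server from the previous step is still listening on the default port 8000; the prompt and parameters are placeholders:

```python
import requests

# Any request through the OpenAI-compatible API will produce FastAPI spans.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "facebook/opt-125m",
        "prompt": "San Francisco is a",
        "max_tokens": 16,
    },
)
print(response.json())
```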
-![FastAPI Spans](https://i.imgur.com/hywvoOJ.png) \ No newline at end of file +![FastAPI Spans](https://i.imgur.com/hywvoOJ.png) diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index 4a85f953b0b..6df95945166 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -1,14 +1,16 @@ -# Prometheus and Grafana +# Prometheus and Grafana -This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. +This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites. + +Install: -Install: - [`docker`](https://docs.docker.com/engine/install/) - [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository) ## Launch Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint: + ```bash vllm serve mistralai/Mistral-7B-v0.1 \ --max-model-len 2048 \ @@ -16,11 +18,13 @@ vllm serve mistralai/Mistral-7B-v0.1 \ ``` Launch Prometheus and Grafana servers with `docker compose`: + ```bash docker compose up ``` Submit some sample requests to the server: + ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json @@ -41,13 +45,13 @@ Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the de ### Add Prometheus Data Source -Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. +Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus. On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`. Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.". -### Import Dashboard +### Import Dashboard Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following: diff --git a/examples/other/logging_configuration.md b/examples/other/logging_configuration.md index 9ac8b13cd5e..acd9c1f2bc0 100644 --- a/examples/other/logging_configuration.md +++ b/examples/other/logging_configuration.md @@ -15,7 +15,6 @@ more-complex-and-more-flexible. - Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and set `VLLM_LOGGING_CONFIG_PATH=` - ## Logging Configuration Environment Variables ### `VLLM_CONFIGURE_LOGGING` @@ -45,7 +44,6 @@ schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema- If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is disabled, an error will occur while starting vLLM. 
- ## Examples ### Example 1: Customize vLLM root logger @@ -98,7 +96,6 @@ VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` - ### Example 2: Silence a particular vLLM logger To silence a particular vLLM logger, it is necessary to provide custom logging @@ -153,7 +150,6 @@ VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \ vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` - ### Example 3: Disable vLLM default logging configuration To disable vLLM's default logging configuration and silence all vLLM loggers, @@ -166,7 +162,6 @@ VLLM_CONFIGURE_LOGGING=0 \ vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 ``` - ## Additional resources - [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details) diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md index e20c992a381..c408d4a6752 100644 --- a/vllm/distributed/kv_transfer/README.md +++ b/vllm/distributed/kv_transfer/README.md @@ -14,8 +14,8 @@ The KV cache transfer contains three layer of abstractions: Why we need KV lookup buffer: FIFO pipe itself is not enough as prefill vLLM worker may process requests in a different order compared to decode vLLM worker. Say the QPS is really high, prefill worker may handle requests in order A -> B -> C, but the decode worker may process request C first. This is not the case that can be naturally handled by FIFO pipe, so we provide KV lookup buffer to help translate a FIFO pipe to a lookup buffer. -NOTE: KV pipe layer is bypassible: you can skip this layer if your distributed -communication service already supports key-value-based lookup (like redis or +NOTE: KV pipe layer is bypassible: you can skip this layer if your distributed +communication service already supports key-value-based lookup (like redis or RDMA database). NOTE: If you want to not only transfer KV caches, but adjust the model execution flow of vLLM as well (for example, allow vLLM to receive KV caches on some tokens and do prefill on the remaining tokens), you can bypass both KV pipe layer and KV lookup buffer layer, and directly implement on KV connector layer. Bear in mind that as vLLM's model input is constantly changing, this implementation will likely be broken when vLLM has new updates. @@ -27,4 +27,3 @@ The example usage is in [this file](../../../examples/online_serving/disaggregat Here is the diagram of how we run disaggretgated prefilling. 
![Disaggregated prefill workflow](./disagg_prefill_workflow.jpg) - From c32d4bf1526b8f9201e2033951b2ae545c99baa6 Mon Sep 17 00:00:00 2001 From: shangmingc Date: Sat, 8 Feb 2025 22:46:19 +0800 Subject: [PATCH 0073/1240] [Bugfix] Remove unused seq_group_metadata_list from ModelInputForGPU (#12935) Signed-off-by: Shangming Cai Signed-off-by: Louis Ulmer --- vllm/worker/model_runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 12baecde6e4..c7814f17375 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -98,7 +98,6 @@ class ModelInputForGPU(ModelRunnerInputBase): finished_requests_ids: Optional[List[str]] = None virtual_engine: int = 0 async_callback: Optional[Callable] = None - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None scheduler_outputs: Optional[SchedulerOutputs] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: From 59392a88463e20de7f87c20e9fab3f41df4bf803 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 9 Feb 2025 00:06:56 +0800 Subject: [PATCH 0074/1240] [bugfix] fix early import of flash attention (#12959) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- vllm/attention/backends/flash_attn.py | 13 +++++++------ vllm/attention/backends/mla/utils.py | 5 +++-- vllm/attention/backends/utils.py | 14 ++++++-------- vllm/v1/attention/backends/flash_attn.py | 7 ++++--- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 971fe411695..5aca10079f9 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -14,8 +14,8 @@ AttentionMetadataBuilder, AttentionType) from vllm.attention.backends.utils import ( - PAD_SLOT_ID, VLLM_FLASH_ATTN_VERSION, CommonAttentionState, - compute_slot_mapping, compute_slot_mapping_start_idx, + PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping, + compute_slot_mapping_start_idx, get_flash_attn_version, get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set, is_block_tables_empty) @@ -640,6 +640,7 @@ def __init__( f"Head size {head_size} is not supported by FlashAttention. " f"Supported head sizes are: {support_head_sizes}.") self.attn_type = attn_type + self.vllm_flash_attn_version = get_flash_attn_version() def forward( self, @@ -759,7 +760,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=prefill_output, - fa_version=VLLM_FLASH_ATTN_VERSION, + fa_version=self.vllm_flash_attn_version, ) else: # prefix-enabled attention @@ -782,7 +783,7 @@ def forward( block_table=prefill_meta.block_tables, softcap=logits_soft_cap, out=prefill_output, - fa_version=VLLM_FLASH_ATTN_VERSION, + fa_version=self.vllm_flash_attn_version, ) if decode_meta := attn_metadata.decode_metadata: @@ -811,7 +812,7 @@ def forward( softcap=logits_soft_cap, block_table=decode_meta.block_tables, out=decode_output, - fa_version=VLLM_FLASH_ATTN_VERSION, + fa_version=self.vllm_flash_attn_version, ) else: # Use flash_attn_with_kvcache for normal decoding. 
@@ -832,7 +833,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=decode_output.unsqueeze(1), - fa_version=VLLM_FLASH_ATTN_VERSION, + fa_version=self.vllm_flash_attn_version, ) return output diff --git a/vllm/attention/backends/mla/utils.py b/vllm/attention/backends/mla/utils.py index c22f7e92103..a41140ec837 100644 --- a/vllm/attention/backends/mla/utils.py +++ b/vllm/attention/backends/mla/utils.py @@ -12,7 +12,7 @@ from vllm.attention.backends.abstract import (AttentionLayer, AttentionMetadata, MLAAttentionImpl, T) -from vllm.attention.backends.utils import VLLM_FLASH_ATTN_VERSION +from vllm.attention.backends.utils import get_flash_attn_version from vllm.distributed import (get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -181,6 +181,7 @@ def __init__( self.q_proj = q_proj self.kv_b_proj = kv_b_proj self.o_proj = o_proj + self.vllm_flash_attn_version = get_flash_attn_version() def _v_up_proj_and_o_proj(self, x): if envs.VLLM_MLA_PERFORM_MATRIX_ABSORPTION: @@ -515,7 +516,7 @@ def _forward_prefill_flash( max_seqlen_k=max_prefill_seq_len, softmax_scale=self.scale, causal=True, - fa_version=VLLM_FLASH_ATTN_VERSION, + fa_version=self.vllm_flash_attn_version, ) attn_output = attn_output\ .view(-1, self.num_heads, q.shape[-1])[..., :v.shape[-1]]\ diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index e8a34434122..5c1f9916e22 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -587,11 +587,11 @@ def get_num_prefill_decode_query_kv_tokens( num_decode_query_tokens) -try: - from vllm.vllm_flash_attn.flash_attn_interface import ( - fa_version_unsupported_reason, is_fa_version_supported) +def get_flash_attn_version(): + try: + from vllm.vllm_flash_attn.flash_attn_interface import ( + fa_version_unsupported_reason, is_fa_version_supported) - def flash_attn_version(): # if hopper default to FA3, otherwise stick to FA2 for now # TODO(lucas): profile FA3 on ampere to see if it makes sense to # use FA3 as default for both @@ -610,7 +610,5 @@ def flash_attn_version(): assert is_fa_version_supported(fa_version) return fa_version - - VLLM_FLASH_ATTN_VERSION = flash_attn_version() -except (ImportError, AssertionError): - VLLM_FLASH_ATTN_VERSION = None + except (ImportError, AssertionError): + return None diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 204afc9f402..5cb1e2fd26a 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -10,7 +10,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import VLLM_FLASH_ATTN_VERSION +from vllm.attention.backends.utils import get_flash_attn_version from vllm.logger import init_logger from vllm.utils import cdiv from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -132,6 +132,7 @@ def __init__( "encoder/decoder cross-attention " "are not implemented for " "FlashAttentionImpl") + self.vllm_flash_attn_version = get_flash_attn_version() def forward( self, @@ -205,7 +206,7 @@ def forward( window_size=self.sliding_window, block_table=attn_metadata.block_table, softcap=self.logits_soft_cap, - fa_version=VLLM_FLASH_ATTN_VERSION, + fa_version=self.vllm_flash_attn_version, ) return output @@ -227,7 +228,7 @@ def forward( logits_soft_cap=self.logits_soft_cap, 
block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, - fa_version=VLLM_FLASH_ATTN_VERSION, + fa_version=self.vllm_flash_attn_version, ) return output From ba42e05c8f631ddf97ca87cca9a169d04a3244b9 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 9 Feb 2025 04:32:16 +0800 Subject: [PATCH 0075/1240] [VLM] Merged multi-modal processor for GLM4V (#12449) Signed-off-by: Jee Jee Li Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 2 +- examples/offline_inference/vision_language.py | 4 +- .../multimodal/processing/test_common.py | 1 + vllm/model_executor/models/chatglm.py | 382 ++++++++++-------- 4 files changed, 222 insertions(+), 167 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 38f36b54d89..91e6c42d526 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -719,7 +719,7 @@ See [this page](#generative-models) for more information on how to use generativ * `THUDM/glm-4v-9b` etc. * ✅︎ * ✅︎ - * + * ✅︎ - * `H2OVLChatModel` * H2OVL * T + IE+ diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 436c3657059..9a4183106cf 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -106,7 +106,9 @@ def run_glm4v(question: str, modality: str): trust_remote_code=True, enforce_eager=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = question + prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ + {question}<|assistant|>" + stop_token_ids = [151329, 151336, 151338] return llm, prompt, stop_token_ids diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 77cf3442df9..8658e60bc5b 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -147,6 +147,7 @@ def _test_processing_correctness( "facebook/chameleon-7b", "deepseek-ai/deepseek-vl2-tiny", "adept/fuyu-8b", + "THUDM/glm-4v-9b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3", diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index a3164867525..9ee9e9ca800 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -4,20 +4,21 @@ # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace -from array import array -from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict) +from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, + TypedDict, Union) import torch -from PIL import Image from torch import nn from torch.nn import LayerNorm +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from transformers import PreTrainedTokenizer, TensorType +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import 
SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -35,73 +36,55 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs, - NestedTensors) -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, BatchFeature, + BoundPromptReplacement, + MultiModalFieldConfig, + PlaceholderFeaturesInfo, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) +IMAGE_TOKEN_ID = 151329 -def calculate_image_placeholder(vision_config): - return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 +def build_normalization_transform(image_size: int) -> transforms.Compose: + """ + Build a normalization transform which can be applied to one or + more input images from which we want to extract visual features. + + Args: + image_size: size of the image to be processed for visual embeddings. + + Returns: + Callable transform for normalizing and resizing one RGB image. 
+ """ -def mm_input_mapper_for_glmv( - ctx: InputContext, - data: ModalityData[object], -) -> Dict: - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - if tokenizer is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") - try: - raw_batch_data = tokenizer.apply_chat_template( - conversation=[{ - "role": "user", - "image": data - }], - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True).data - except Exception: - logger.error("Failed to process image (%s)", data) - raise - pixel_values = raw_batch_data['images'] - - return MultiModalKwargs({'pixel_values': pixel_values}) - - -def merge_glm_vision_embeddings( - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - vision_embeddings: torch.Tensor, - boi_token_id: int, - eoi_token_id: int, -) -> torch.Tensor: - - boi_positions = (input_ids == boi_token_id).nonzero(as_tuple=True)[0] - eoi_positions = (input_ids == eoi_token_id).nonzero(as_tuple=True)[0] - - mask = torch.zeros_like(input_ids, dtype=torch.bool) - - for boi_pos, eoi_pos in zip(boi_positions, eoi_positions): - assert boi_pos < eoi_pos - mask[boi_pos:eoi_pos + 1] = True - inputs_embeds[mask] = vision_embeddings.view(-1, - vision_embeddings.shape[-1]) - return inputs_embeds + return transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + (0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711), + ), + ]) + + +def calculate_image_placeholder(vision_config): + return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 class GLMImagePixelInputs(TypedDict): @@ -109,120 +92,177 @@ class GLMImagePixelInputs(TypedDict): """Shape: `(batch_size, num_channels, height, width)`""" -def get_max_glmv_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config(ChatGLMConfig) +class GLM4VProcessor: + """ + This model doesn't define its own HF processor, + so we implement our own one here. 
- vision_config = getattr(hf_config, 'vision_config', None) - if vision_config is None: - return 1 - elif isinstance(vision_config, dict): - return calculate_image_placeholder(vision_config) + """ - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def __init__( + self, + config: ChatGLMConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + self.config = config + self.tokenizer = tokenizer -def dummy_data_for_glmv(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]) -> DummyData: - hf_config = ctx.get_hf_config(ChatGLMConfig) - vision_config = getattr(hf_config, 'vision_config', None) + if hasattr(self.config, "vision_config"): + self.image_transform = build_normalization_transform( + config.vision_config["image_size"]) + else: + self.image_transform = None - if vision_config is None: - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len) - seq_data = SequenceData(token_ids) - return DummyData(seq_data, None) - elif isinstance(vision_config, dict): - image_size = vision_config["image_size"] - image_placeholder_length = calculate_image_placeholder(vision_config) - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [hf_config.boi_token_id] + - [0] * image_placeholder_length + - [hf_config.eoi_token_id]) - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0] * (seq_len - image_placeholder_length - 2)) - seq_data = SequenceData(token_ids) + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + text_inputs = self.tokenizer(text) + if len(images) == 0: + image_inputs = {} + else: + if self.image_transform is None: + raise ValueError("This model does not support image inputs") + + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) - mm_data = { - "image": Image.new("RGB", (image_size, image_size), color=0) - } - return DummyData(seq_data, mm_data) +class GLM4VProcessingInfo(BaseProcessingInfo): - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def __init__(self, ctx): + super().__init__(ctx) + self._pre_calculate() + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} -def find_all_positions(input_ids: List[int], target: int) -> List[int]: - return [index for index, value in enumerate(input_ids) if value == target] + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.image_token_num + 2} -def input_processor_for_glmv(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs + def _pre_calculate(self): + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + self.image_token_num = calculate_image_placeholder(vision_config) + self.image_size = vision_config["image_size"] - hf_config = ctx.get_hf_config(ChatGLMConfig) - vision_config = getattr(hf_config, 'vision_config', None) + def 
get_num_image_tokens(self) -> int: + return self.image_token_num + 2 - if vision_config is None: - return inputs - elif isinstance(vision_config, dict): - image_placeholder_length = calculate_image_placeholder(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def get_image_size(self) -> ImageSize: - input_ids = inputs["prompt_token_ids"] + return ImageSize(height=self.image_size, width=self.image_size) - tokenizer = cached_get_tokenizer( - ctx.model_config.model, - trust_remote_code=ctx.model_config.trust_remote_code) + def get_hf_processor(self) -> GLM4VProcessor: + return GLM4VProcessor( + self.get_hf_config(), + self.get_tokenizer(), + ) - try: - raw_batch_data = tokenizer.apply_chat_template( - conversation=[{ - "role": "user", - "image": multi_modal_data["image"], - "content": inputs['prompt'], - }], - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - return_dict=True, - ).data - except Exception: - logger.error("Failed to process content (%s)", inputs['prompt']) - raise - input_ids = raw_batch_data['input_ids'][0].tolist() - boi_token_id = hf_config.boi_token_id - eoi_token_id = hf_config.eoi_token_id - boi_positions = find_all_positions(input_ids, boi_token_id) - eoi_positions = find_all_positions(input_ids, eoi_token_id) +class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): - assert len(boi_positions) == len(eoi_positions) + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + target_width, target_height = self.info.get_image_size() - new_input_ids = [] - final_processed_position = 0 + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + text = "<|begin_of_image|><|endoftext|><|end_of_image|>" + return ProcessorInputs( + prompt_text=text, + mm_data=mm_data, + ) - for boi_position, eoi_position in zip(boi_positions, eoi_positions): - assert boi_position < eoi_position - new_input_ids.extend(input_ids[final_processed_position:boi_position + - 1]) - new_input_ids.extend([input_ids[boi_position + 1]] * - image_placeholder_length) - final_processed_position = eoi_position - new_input_ids.extend(input_ids[final_processed_position:]) +class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + + def get_replacement(item_idx: int): + image_tokens = self.info.image_token_num + return [IMAGE_TOKEN_ID] * image_tokens + + return [ + PromptReplacement( + modality="image", + target=[IMAGE_TOKEN_ID], + replacement=get_replacement, + ), + ] - prompt = inputs.get("prompt") - if prompt is None: - prompt = tokenizer.decode(new_input_ids) + def _apply_prompt_replacements( + self, + token_ids: list[int], + mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], + mm_item_counts: Mapping[str, int], + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: + token_ids, text, placeholders = super()._apply_prompt_replacements( + token_ids=token_ids, + 
mm_prompt_repls=mm_prompt_repls, + mm_item_counts=mm_item_counts, + ) + hf_config = self.info.get_hf_config() + boi_token_id = hf_config.boi_token_id + eoi_token_id = hf_config.eoi_token_id + placeholders = { + modality: [ + PlaceholderFeaturesInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=p.start_idx - 1, + tokens=[boi_token_id] + p.tokens + [eoi_token_id], + ) for p in ps + ] + for modality, ps in placeholders.items() + } - return token_inputs( - prompt_token_ids=new_input_ids, - prompt=prompt, - multi_modal_data=multi_modal_data, - ) + return token_ids, text, placeholders class GLMAttention(nn.Module): @@ -572,12 +612,16 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.embedding(input_ids) if multimodal_embeddings is not None: - inputs_embeds = merge_glm_vision_embeddings( + inputs_embeds = merge_multimodal_embeddings( input_ids=input_ids, inputs_embeds=inputs_embeds, - vision_embeddings=multimodal_embeddings, - boi_token_id=self.config.boi_token_id, - eoi_token_id=self.config.eoi_token_id) + multimodal_embeddings=multimodal_embeddings, + placeholder_token_id=[ + self.config.boi_token_id, + IMAGE_TOKEN_ID, + self.config.eoi_token_id, + ], + ) return inputs_embeds def forward( @@ -593,14 +637,12 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. - if intermediate_tensors is None and inputs_embeds is None: + if intermediate_tensors is not None: + inputs_embeds = intermediate_tensors["hidden_states"] + elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) - input_ids = None - else: - inputs_embeds = intermediate_tensors["hidden_states"] - # Run encoder. 
hidden_states = self.encoder( hidden_states=inputs_embeds, @@ -763,11 +805,21 @@ def get_mm_mapping(self) -> MultiModelKeys: connector="transformer.vision.linear_proj", tower_model="transformer.vision.transformer") + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + return self.transformer.get_multimodal_embeddings(**kwargs) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids, + multimodal_embeddings) + -@MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) -@INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) +@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor, + info=GLM4VProcessingInfo, + dummy_inputs=GLM4VDummyInputsBuilder) class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, SupportsMultiModal): # Ensure that the LoRA support check passes when the class is not From 9ed7b4d836bdd15f6dc58d5f27d1fbb39c26862d Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 8 Feb 2025 12:48:30 -0800 Subject: [PATCH 0076/1240] [V1][Minor] Remove outdated comment (#12968) Signed-off-by: Woosuk Kwon Signed-off-by: Louis Ulmer --- vllm/v1/core/kv_cache_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index eefc2e19c20..f8d08d0e402 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -205,8 +205,6 @@ def allocate_slots( # Should not exceed the maximum number of blocks per request. # This is especially because the block table has the shape # [..., max_num_blocks_per_req]. - # TODO(woosuk): Check and reject requests if - # num_prompt_tokens + max_tokens > max_model_len. 
self.max_num_blocks_per_req - len(req_blocks), ) assert num_new_blocks > 0 From 8bf5b67da74bb12a4cca7493f4d9c5023f58b0d9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 8 Feb 2025 22:12:53 +0100 Subject: [PATCH 0077/1240] [RFC] [Mistral] FP8 format (#10130) Signed-off-by: mgoin Co-authored-by: mgoin Signed-off-by: Louis Ulmer --- vllm/model_executor/models/llama.py | 20 ++++++++-- vllm/model_executor/models/pixtral.py | 7 +++- vllm/transformers_utils/config.py | 37 ++++++++++++++++--- vllm/transformers_utils/tokenizers/mistral.py | 3 +- 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 866c6923475..2ff52dd7891 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -467,6 +467,9 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): mistral_mapping = { "layers": "model.layers", "attention": "self_attn", + "qscale_act": "input_scale", + "qscale_weight": "weight_scale", + "kv_fake_quantizer.qscale_act": "kv_scale", "wq": "q_proj", "wk": "k_proj", "wv": "v_proj", @@ -590,15 +593,24 @@ def permute(w: torch.Tensor, n_heads: int): modules = name.split(".") # rotary embeds should be sliced - if "wk" in modules: + if "wk" in modules and modules[-1] == "weight": loaded_weight = permute(loaded_weight, self.config.num_key_value_heads) - elif "wq" in modules: + elif "wq" in modules and modules[-1] == "weight": loaded_weight = permute(loaded_weight, self.config.num_attention_heads) - for item in modules: - if item in mapping and mapping[item] not in name: + num_modules = len(modules) + for i in range(num_modules): + item = modules[i] + next_item = modules[i + 1] if i < num_modules - 1 else None + + combined_item = (f"{item}.{next_item}" + if next_item is not None else None) + + if combined_item in mapping: + name = name.replace(combined_item, mapping[combined_item]) + elif item in mapping and mapping[item] not in name: name = name.replace(item, mapping[item]) return name, loaded_weight diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 003e9c84c1c..e78e8d62cc4 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -54,8 +54,11 @@ def get_max_pixtral_image_tokens(ctx: InputContext): tokenizer_mode=ctx.model_config.tokenizer_mode) mm_encoder = tokenizer.instruct.mm_encoder - max_image_size = mm_encoder.mm_config.max_image_size - image_patch_size = mm_encoder.mm_config.image_patch_size + image_config = mm_encoder.mm_config if hasattr( + mm_encoder, "mm_config") else mm_encoder.image_config + + max_image_size = image_config.max_image_size + image_patch_size = image_config.image_patch_size return ((max_image_size // image_patch_size)**2) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index fb5cc3ec072..42b45e10e3f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -4,7 +4,7 @@ import json import os from pathlib import Path -from typing import Any, Dict, Optional, Type, Union +from typing import Any, Dict, Literal, Optional, Type, Union import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, list_repo_files, @@ -554,7 +554,8 @@ def recurse_elems(elem: Any): for key, value in elem.items(): key = config_mapping.get(key, key) config_dict[key] = recurse_elems(value) - return PretrainedConfig(**config_dict) + + return config_dict else: return elem @@ -566,12 +567,30 @@ def 
recurse_elems(elem: Any): config_dict["max_position_embeddings"] = config_dict.get( "max_position_embeddings", 128_000) + if config_dict.get("quantization") is not None: + quantization = config_dict.get("quantization", {}) + if quantization.get("qformat_weight") == "fp8_e4m3": + # This maps to the FP8 static per-tensor quantization scheme + quantization_config = { + "quant_method": "fp8", + "activation_scheme": "static" + } + else: + raise ValueError( + f"Found unknown quantization='{quantization}' in config") + + config_dict["quantization_config"] = quantization_config + + config_type: Literal["text", + "multimodal"] = "multimodal" if config_dict.get( + "vision_encoder") is not None else "text" + if config_dict.get("moe") is not None: config_dict["architectures"] = ["MixtralForCausalLM"] else: config_dict["architectures"] = ["MistralForCausalLM"] - if config_dict.get("vision_encoder") is not None: + if config_type == "multimodal": multimodal_config = config_dict.pop("vision_encoder") config_dict = { @@ -583,8 +602,16 @@ def recurse_elems(elem: Any): config_dict.update(kwargs) - config = recurse_elems(config_dict) - return config + config_dict = recurse_elems(config_dict) + + # transform to HF config format + if config_type == "multimodal": + config_dict["text_config"] = PretrainedConfig( + **config_dict["text_config"]) + config_dict["vision_config"] = PretrainedConfig( + **config_dict["vision_config"]) + + return PretrainedConfig(**config_dict) def get_hf_image_processor_config( diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 7a1dba42446..8d96fcd278e 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -88,7 +88,8 @@ def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: def find_tokenizer_file(files: List[str]): - file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$") + file_pattern = re.compile( + r"^tokenizer\.model\.v.*$|^tekken\.json$|^tokenizer\.mm\.model\.v.*$") matched_files = [file for file in files if file_pattern.match(file)] if len(matched_files) > 1: From 3eede7eec891b11f16729621b296f0f4d5023aa6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 8 Feb 2025 15:32:32 -0800 Subject: [PATCH 0078/1240] [V1] Cache `uses_mrope` in GPUModelRunner (#12969) Signed-off-by: Louis Ulmer --- vllm/v1/worker/gpu_model_runner.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e0a096a9106..fdbca70bda7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -92,6 +92,7 @@ def __init__( # Multi-modal data support self.input_registry = INPUT_REGISTRY self.mm_registry = MULTIMODAL_REGISTRY + self.uses_mrope = model_config.uses_mrope # NOTE: Initialized input mapper is only used for processing dummy # multimodal data into multimodal kwargs for GPU memory profiling. @@ -147,7 +148,7 @@ def __init__( device=self.device) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) - if self.model_config.uses_mrope: + if self.uses_mrope: # NOTE: `mrope_positions` is implemented with one additional dummy # position on purpose to make it non-contiguous so that it can work # with torch compile. 
@@ -284,7 +285,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: ) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) - if self.model_config.uses_mrope: + if self.uses_mrope: image_grid_thw = [] video_grid_thw = [] second_per_grid_ts = [] @@ -411,7 +412,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Calculate M-RoPE positions. # Only relevant for models using M-RoPE (e.g, Qwen2-VL) - if self.model_config.uses_mrope: + if self.uses_mrope: self._calc_mrope_positions(scheduler_output) # Get token indices. @@ -458,7 +459,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Copy the tensors to the GPU. self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - if self.model_config.uses_mrope: + if self.uses_mrope: # Only relevant for models using M-RoPE (e.g, Qwen2-VL) self.mrope_positions[:, :total_num_scheduled_tokens].copy_( self.mrope_positions_cpu[:, :total_num_scheduled_tokens], @@ -817,13 +818,14 @@ def execute_model( # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids[:num_input_tokens] inputs_embeds = None + if self.uses_mrope: + positions = self.mrope_positions[:, :num_input_tokens] + else: + positions = self.positions[:num_input_tokens] # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): - positions = self.mrope_positions[:, :num_input_tokens] \ - if self.model_config.uses_mrope \ - else self.positions[:num_input_tokens] hidden_states = self.model( input_ids=input_ids, positions=positions, @@ -1001,10 +1003,11 @@ def _dummy_run( else: input_ids = self.input_ids[:num_tokens] inputs_embeds = None + if self.uses_mrope: + positions = self.mrope_positions[:, :num_tokens] + else: + positions = self.positions[:num_tokens] with set_forward_context(None, self.vllm_config): - positions = self.mrope_positions[:, :num_tokens] \ - if self.model_config.uses_mrope \ - else self.positions[:num_tokens] hidden_states = model( input_ids=input_ids, positions=positions, From e3280b7950d45a3ae55eeb8f4cb19a8abaf9e16c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 9 Feb 2025 15:00:00 +0800 Subject: [PATCH 0079/1240] [core] port pynvml into vllm codebase (#12963) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- .pre-commit-config.yaml | 20 +- requirements-cuda.txt | 1 - tests/utils.py | 5 +- vllm/third_party/__init__.py | 0 vllm/third_party/pynvml.py | 6139 ++++++++++++++++++++++++++++++++++ vllm/utils.py | 39 +- 6 files changed, 6169 insertions(+), 35 deletions(-) create mode 100644 vllm/third_party/__init__.py create mode 100644 vllm/third_party/pynvml.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 118451593d2..352eb2df01b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,25 +8,28 @@ repos: - id: yapf args: [--in-place, --verbose] additional_dependencies: [toml] # TODO: Remove when yapf is upgraded + exclude: 'vllm/third_party/.*' - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.9.3 hooks: - id: ruff args: [--output-format, github] + exclude: 'vllm/third_party/.*' - repo: https://github.com/codespell-project/codespell rev: v2.4.0 hooks: - id: codespell - exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*' + exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|vllm/third_party/.*' - repo: https://github.com/PyCQA/isort rev: 5.13.2 
hooks: - id: isort + exclude: 'vllm/third_party/.*' - repo: https://github.com/pre-commit/mirrors-clang-format rev: v19.1.7 hooks: - id: clang-format - exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))' + exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' types_or: [c++, cuda] args: [--style=file, --verbose] - repo: https://github.com/jackdewinter/pymarkdown @@ -34,10 +37,12 @@ repos: hooks: - id: pymarkdown args: [fix] + exclude: 'vllm/third_party/.*' - repo: https://github.com/rhysd/actionlint rev: v1.7.7 hooks: - id: actionlint + exclude: 'vllm/third_party/.*' - repo: local hooks: - id: mypy-local @@ -47,6 +52,7 @@ repos: types: [python] additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests] stages: [pre-commit] # Don't run in CI + exclude: 'vllm/third_party/.*' - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 entry: tools/mypy.sh 1 "3.9" @@ -54,6 +60,7 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI + exclude: 'vllm/third_party/.*' - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.10 entry: tools/mypy.sh 1 "3.10" @@ -61,6 +68,7 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI + exclude: 'vllm/third_party/.*' - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.11 entry: tools/mypy.sh 1 "3.11" @@ -68,6 +76,7 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI + exclude: 'vllm/third_party/.*' - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.12 entry: tools/mypy.sh 1 "3.12" @@ -75,16 +84,19 @@ repos: types: [python] additional_dependencies: *mypy_deps stages: [manual] # Only run in CI + exclude: 'vllm/third_party/.*' - id: shellcheck name: Lint shell scripts entry: tools/shellcheck.sh language: script types: [shell] + exclude: 'vllm/third_party/.*' - id: png-lint name: Lint PNG exports from excalidraw entry: tools/png-lint.sh language: script types: [png] + exclude: 'vllm/third_party/.*' - id: signoff-commit name: Sign-off Commit entry: bash @@ -97,17 +109,20 @@ repos: language: system verbose: true stages: [commit-msg] + exclude: 'vllm/third_party/.*' - id: check-spdx-header name: Check SPDX headers entry: python tools/check_spdx_header.py language: python types: [python] + exclude: 'vllm/third_party/.*' - id: suggestion name: Suggestion entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' language: system verbose: true pass_filenames: false + exclude: 'vllm/third_party/.*' - id: check-filenames name: Check for spaces in all filenames entry: bash @@ -117,3 +132,4 @@ repos: language: system always_run: true pass_filenames: false + exclude: 'vllm/third_party/.*' diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 78fa360f2dc..0e7217fb376 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -3,7 +3,6 @@ # Dependencies for NVIDIA GPUs ray[default] >= 2.9 -nvidia-ml-py >= 12.560.30 # for pynvml package torch == 2.5.1 torchaudio==2.5.1 # These must be updated alongside torch 
diff --git a/tests/utils.py b/tests/utils.py index 3b32052fe4c..f39cbe7ede0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -46,8 +46,9 @@ def _nvml(): finally: amdsmi_shut_down() elif current_platform.is_cuda(): - from pynvml import (nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, - nvmlInit, nvmlShutdown) + from vllm.third_party.pynvml import (nvmlDeviceGetHandleByIndex, + nvmlDeviceGetMemoryInfo, nvmlInit, + nvmlShutdown) @contextmanager def _nvml(): diff --git a/vllm/third_party/__init__.py b/vllm/third_party/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py new file mode 100644 index 00000000000..0a4be23a093 --- /dev/null +++ b/vllm/third_party/pynvml.py @@ -0,0 +1,6139 @@ +# SPDX-License-Identifier: Apache-2.0 +# copied from https://pypi.org/project/nvidia-ml-py +# version 12.570.86 + +##### +# Copyright (c) 2011-2023, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +##### + +## +# Python bindings for the NVML library +## +from ctypes import * +from ctypes.util import find_library +from functools import wraps +import sys +import os +import threading +import string + +## C Type mappings ## +## Enums +_nvmlEnableState_t = c_uint +NVML_FEATURE_DISABLED = 0 +NVML_FEATURE_ENABLED = 1 + +_nvmlBrandType_t = c_uint +NVML_BRAND_UNKNOWN = 0 +NVML_BRAND_QUADRO = 1 +NVML_BRAND_TESLA = 2 +NVML_BRAND_NVS = 3 +NVML_BRAND_GRID = 4 # Deprecated from API reporting. Keeping definition for backward compatibility. +NVML_BRAND_GEFORCE = 5 +NVML_BRAND_TITAN = 6 +NVML_BRAND_NVIDIA_VAPPS = 7 # NVIDIA Virtual Applications +NVML_BRAND_NVIDIA_VPC = 8 # NVIDIA Virtual PC +NVML_BRAND_NVIDIA_VCS = 9 # NVIDIA Virtual Compute Server +NVML_BRAND_NVIDIA_VWS = 10 # NVIDIA RTX Virtual Workstation +NVML_BRAND_NVIDIA_CLOUD_GAMING = 11 # NVIDIA Cloud Gaming +NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING # Deprecated from API reporting. Keeping definition for backward compatibility. 
+NVML_BRAND_QUADRO_RTX = 12 +NVML_BRAND_NVIDIA_RTX = 13 +NVML_BRAND_NVIDIA = 14 +NVML_BRAND_GEFORCE_RTX = 15 # Unused +NVML_BRAND_TITAN_RTX = 16 # Unused +NVML_BRAND_COUNT = 17 + +_nvmlTemperatureThresholds_t = c_uint +NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0 +NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1 +NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2 +NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6 +NVML_TEMPERATURE_THRESHOLD_GPS_CURR = 7 +NVML_TEMPERATURE_THRESHOLD_COUNT = 8 + +_nvmlTemperatureSensors_t = c_uint +NVML_TEMPERATURE_GPU = 0 +NVML_TEMPERATURE_COUNT = 1 + + +_nvmlComputeMode_t = c_uint +NVML_COMPUTEMODE_DEFAULT = 0 +NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1 ## Support Removed +NVML_COMPUTEMODE_PROHIBITED = 2 +NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 +NVML_COMPUTEMODE_COUNT = 4 + +_nvmlMemoryLocation_t = c_uint +NVML_MEMORY_LOCATION_L1_CACHE = 0 +NVML_MEMORY_LOCATION_L2_CACHE = 1 +NVML_MEMORY_LOCATION_DEVICE_MEMORY = 2 +NVML_MEMORY_LOCATION_DRAM = 2 +NVML_MEMORY_LOCATION_REGISTER_FILE = 3 +NVML_MEMORY_LOCATION_TEXTURE_MEMORY = 4 +NVML_MEMORY_LOCATION_TEXTURE_SHM = 5 +NVML_MEMORY_LOCATION_CBU = 6 +NVML_MEMORY_LOCATION_SRAM = 7 +NVML_MEMORY_LOCATION_COUNT = 8 + +NVML_NVLINK_MAX_LINKS = 18 + +# For backwards compatibility, maintain the incorrectly-named "LANES" define +NVML_NVLINK_MAX_LANES = NVML_NVLINK_MAX_LINKS + +_nvmlNvLinkErrorCounter_t = c_uint +NVML_NVLINK_ERROR_DL_REPLAY = 0 +NVML_NVLINK_ERROR_DL_RECOVERY = 1 +NVML_NVLINK_ERROR_DL_CRC_FLIT = 2 +NVML_NVLINK_ERROR_DL_CRC_DATA = 3 +NVML_NVLINK_ERROR_DL_ECC_DATA = 4 +NVML_NVLINK_ERROR_COUNT = 5 + +_nvmlNvLinkEccLaneErrorCounter_t = c_uint +NVML_NVLINK_ERROR_DL_ECC_LANE0 = 0 +NVML_NVLINK_ERROR_DL_ECC_LANE1 = 1 +NVML_NVLINK_ERROR_DL_ECC_LANE2 = 2 +NVML_NVLINK_ERROR_DL_ECC_LANE3 = 3 +NVML_NVLINK_ERROR_DL_ECC_COUNT = 5 + +_nvmlNvLinkCapability_t = c_uint +NVML_NVLINK_CAP_P2P_SUPPORTED = 0 +NVML_NVLINK_CAP_SYSMEM_ACCESS = 1 +NVML_NVLINK_CAP_P2P_ATOMICS = 2 +NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3 +NVML_NVLINK_CAP_SLI_BRIDGE = 4 +NVML_NVLINK_CAP_VALID = 5 +NVML_NVLINK_CAP_COUNT = 6 + +_nvmlNvLinkUtilizationCountPktTypes_t = c_uint +NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1 +NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2 +NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4 +NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8 +NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10 +NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20 +NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40 +NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80 +NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF + +_nvmlNvLinkUtilizationCountUnits_t = c_uint +NVML_NVLINK_COUNTER_UNIT_CYCLES = 0 +NVML_NVLINK_COUNTER_UNIT_PACKETS = 1 +NVML_NVLINK_COUNTER_UNIT_BYTES = 2 +NVML_NVLINK_COUNTER_UNIT_RESERVED = 3 +NVML_NVLINK_COUNTER_UNIT_COUNT = 4 + +_nvmlNvLinkDeviceType_t = c_uint +NVML_NVLINK_DEVICE_TYPE_GPU = 0x00 +NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01 +NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02 +NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF + +# These are deprecated, instead use _nvmlMemoryErrorType_t +_nvmlEccBitType_t = c_uint +NVML_SINGLE_BIT_ECC = 0 +NVML_DOUBLE_BIT_ECC = 1 +NVML_ECC_ERROR_TYPE_COUNT = 2 + +_nvmlEccCounterType_t = c_uint +NVML_VOLATILE_ECC = 0 +NVML_AGGREGATE_ECC = 1 +NVML_ECC_COUNTER_TYPE_COUNT = 2 + +_nvmlMemoryErrorType_t = c_uint +NVML_MEMORY_ERROR_TYPE_CORRECTED = 0 +NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1 +NVML_MEMORY_ERROR_TYPE_COUNT = 2 + +_nvmlClockType_t = c_uint +NVML_CLOCK_GRAPHICS = 0 +NVML_CLOCK_SM = 1 
+NVML_CLOCK_MEM = 2 +NVML_CLOCK_VIDEO = 3 +NVML_CLOCK_COUNT = 4 + +_nvmlClockId_t = c_uint +NVML_CLOCK_ID_CURRENT = 0 +NVML_CLOCK_ID_APP_CLOCK_TARGET = 1 +NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2 +NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3 +NVML_CLOCK_ID_COUNT = 4 + +_nvmlDriverModel_t = c_uint +NVML_DRIVER_WDDM = 0 +NVML_DRIVER_WDM = 1 +NVML_DRIVER_MCDM = 2 + +NVML_MAX_GPU_PERF_PSTATES = 16 + +_nvmlPstates_t = c_uint +NVML_PSTATE_0 = 0 +NVML_PSTATE_1 = 1 +NVML_PSTATE_2 = 2 +NVML_PSTATE_3 = 3 +NVML_PSTATE_4 = 4 +NVML_PSTATE_5 = 5 +NVML_PSTATE_6 = 6 +NVML_PSTATE_7 = 7 +NVML_PSTATE_8 = 8 +NVML_PSTATE_9 = 9 +NVML_PSTATE_10 = 10 +NVML_PSTATE_11 = 11 +NVML_PSTATE_12 = 12 +NVML_PSTATE_13 = 13 +NVML_PSTATE_14 = 14 +NVML_PSTATE_15 = 15 +NVML_PSTATE_UNKNOWN = 32 + +_nvmlInforomObject_t = c_uint +NVML_INFOROM_OEM = 0 +NVML_INFOROM_ECC = 1 +NVML_INFOROM_POWER = 2 +NVML_INFOROM_DEN = 3 +NVML_INFOROM_COUNT = 4 + +_nvmlReturn_t = c_uint +NVML_SUCCESS = 0 +NVML_ERROR_UNINITIALIZED = 1 +NVML_ERROR_INVALID_ARGUMENT = 2 +NVML_ERROR_NOT_SUPPORTED = 3 +NVML_ERROR_NO_PERMISSION = 4 +NVML_ERROR_ALREADY_INITIALIZED = 5 +NVML_ERROR_NOT_FOUND = 6 +NVML_ERROR_INSUFFICIENT_SIZE = 7 +NVML_ERROR_INSUFFICIENT_POWER = 8 +NVML_ERROR_DRIVER_NOT_LOADED = 9 +NVML_ERROR_TIMEOUT = 10 +NVML_ERROR_IRQ_ISSUE = 11 +NVML_ERROR_LIBRARY_NOT_FOUND = 12 +NVML_ERROR_FUNCTION_NOT_FOUND = 13 +NVML_ERROR_CORRUPTED_INFOROM = 14 +NVML_ERROR_GPU_IS_LOST = 15 +NVML_ERROR_RESET_REQUIRED = 16 +NVML_ERROR_OPERATING_SYSTEM = 17 +NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18 +NVML_ERROR_IN_USE = 19 +NVML_ERROR_MEMORY = 20 +NVML_ERROR_NO_DATA = 21 +NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22 +NVML_ERROR_INSUFFICIENT_RESOURCES = 23 +NVML_ERROR_FREQ_NOT_SUPPORTED = 24 +NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25 +NVML_ERROR_DEPRECATED = 26 +NVML_ERROR_NOT_READY = 27 +NVML_ERROR_GPU_NOT_FOUND = 28 +NVML_ERROR_INVALID_STATE = 29 +NVML_ERROR_UNKNOWN = 999 + +_nvmlFanState_t = c_uint +NVML_FAN_NORMAL = 0 +NVML_FAN_FAILED = 1 + +_nvmlFanControlPolicy_t = c_uint +NVML_FAN_POLICY_TEMPERATURE_CONTINOUS_SW = 0 +NVML_FAN_POLICY_MANUAL = 1 + +_nvmlLedColor_t = c_uint +NVML_LED_COLOR_GREEN = 0 +NVML_LED_COLOR_AMBER = 1 + +_nvmlGpuOperationMode_t = c_uint +NVML_GOM_ALL_ON = 0 +NVML_GOM_COMPUTE = 1 +NVML_GOM_LOW_DP = 2 + +_nvmlPageRetirementCause_t = c_uint +NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0 +NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1 +NVML_PAGE_RETIREMENT_CAUSE_COUNT = 2 + +_nvmlRestrictedAPI_t = c_uint +NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0 +NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1 +NVML_RESTRICTED_API_COUNT = 2 + +_nvmlBridgeChipType_t = c_uint +NVML_BRIDGE_CHIP_PLX = 0 +NVML_BRIDGE_CHIP_BRO4 = 1 +NVML_MAX_PHYSICAL_BRIDGE = 128 + +_nvmlValueType_t = c_uint +NVML_VALUE_TYPE_DOUBLE = 0 +NVML_VALUE_TYPE_UNSIGNED_INT = 1 +NVML_VALUE_TYPE_UNSIGNED_LONG = 2 +NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3 +NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4 +NVML_VALUE_TYPE_SIGNED_INT = 5 +NVML_VALUE_TYPE_UNSIGNED_SHORT = 6 +NVML_VALUE_TYPE_COUNT = 7 + +_nvmlNvlinkVersion_t = c_uint +NVML_NVLINK_VERSION_INVALID = 0 +NVML_NVLINK_VERSION_1_0 = 1 +NVML_NVLINK_VERSION_2_0 = 2 +NVML_NVLINK_VERSION_2_2 = 3 +NVML_NVLINK_VERSION_3_0 = 4 +NVML_NVLINK_VERSION_3_1 = 5 +NVML_NVLINK_VERSION_4_0 = 6 +NVML_NVLINK_VERSION_5_0 = 7 + +_nvmlPerfPolicyType_t = c_uint +NVML_PERF_POLICY_POWER = 0 +NVML_PERF_POLICY_THERMAL = 1 +NVML_PERF_POLICY_SYNC_BOOST = 2 +NVML_PERF_POLICY_BOARD_LIMIT = 3 +NVML_PERF_POLICY_LOW_UTILIZATION = 4 +NVML_PERF_POLICY_RELIABILITY = 5 
+NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10 +NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11 +NVML_PERF_POLICY_COUNT = 12 + +_nvmlEncoderQueryType_t = c_uint +NVML_ENCODER_QUERY_H264 = 0 +NVML_ENCODER_QUERY_HEVC = 1 +NVML_ENCODER_QUERY_AV1 = 2 +NVML_ENCODER_QUERY_UNKNOWN = 255 + +_nvmlFBCSessionType_t = c_uint +NVML_FBC_SESSION_TYPE_UNKNOWN = 0 +NVML_FBC_SESSION_TYPE_TOSYS = 1 +NVML_FBC_SESSION_TYPE_CUDA = 2 +NVML_FBC_SESSION_TYPE_VID = 3 +NVML_FBC_SESSION_TYPE_HWENC = 4 + +_nvmlDetachGpuState_t = c_uint +NVML_DETACH_GPU_KEEP = 0 +NVML_DETACH_GPU_REMOVE = 1 + +_nvmlPcieLinkState_t = c_uint +NVML_PCIE_LINK_KEEP = 0 +NVML_PCIE_LINK_SHUT_DOWN = 1 + +_nvmlSamplingType_t = c_uint +NVML_TOTAL_POWER_SAMPLES = 0 +NVML_GPU_UTILIZATION_SAMPLES = 1 +NVML_MEMORY_UTILIZATION_SAMPLES = 2 +NVML_ENC_UTILIZATION_SAMPLES = 3 +NVML_DEC_UTILIZATION_SAMPLES = 4 +NVML_PROCESSOR_CLK_SAMPLES = 5 +NVML_MEMORY_CLK_SAMPLES = 6 +NVML_MODULE_POWER_SAMPLES = 7 +NVML_JPG_UTILIZATION_SAMPLES = 8 +NVML_OFA_UTILIZATION_SAMPLES = 9 +NVML_SAMPLINGTYPE_COUNT = 10 + +_nvmlPcieUtilCounter_t = c_uint +NVML_PCIE_UTIL_TX_BYTES = 0 +NVML_PCIE_UTIL_RX_BYTES = 1 +NVML_PCIE_UTIL_COUNT = 2 + +_nvmlGpuTopologyLevel_t = c_uint +NVML_TOPOLOGY_INTERNAL = 0 +NVML_TOPOLOGY_SINGLE = 10 +NVML_TOPOLOGY_MULTIPLE = 20 +NVML_TOPOLOGY_HOSTBRIDGE = 30 +NVML_TOPOLOGY_NODE = 40 +NVML_TOPOLOGY_CPU = NVML_TOPOLOGY_NODE +NVML_TOPOLOGY_SYSTEM = 50 + +_nvmlGpuP2PCapsIndex_t = c_uint +NVML_P2P_CAPS_INDEX_READ = 0, +NVML_P2P_CAPS_INDEX_WRITE = 1 +NVML_P2P_CAPS_INDEX_NVLINK =2 +NVML_P2P_CAPS_INDEX_ATOMICS = 3 +# +# NVML_P2P_CAPS_INDEX_PROP is deprecated. +# Use NVML_P2P_CAPS_INDEX_PCI instead. +# +NVML_P2P_CAPS_INDEX_PROP = 4 +NVML_P2P_CAPS_INDEX_PCI = 4 +NVML_P2P_CAPS_INDEX_UNKNOWN = 5 + +_nvmlGpuP2PStatus_t = c_uint +NVML_P2P_STATUS_OK = 0 +NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED = 1 +NVML_P2P_STATUS_CHIPSET_NOT_SUPPORTED = NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED +NVML_P2P_STATUS_GPU_NOT_SUPPORTED = 2 +NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED =3 +NVML_P2P_STATUS_DISABLED_BY_REGKEY =4 +NVML_P2P_STATUS_NOT_SUPPORTED =5 +NVML_P2P_STATUS_UNKNOWN =6 + +_nvmlDeviceArchitecture_t = c_uint +NVML_DEVICE_ARCH_KEPLER = 2 +NVML_DEVICE_ARCH_MAXWELL = 3 +NVML_DEVICE_ARCH_PASCAL = 4 +NVML_DEVICE_ARCH_VOLTA = 5 +NVML_DEVICE_ARCH_TURING = 6 +NVML_DEVICE_ARCH_AMPERE = 7 +NVML_DEVICE_ARCH_ADA = 8 +NVML_DEVICE_ARCH_HOPPER = 9 +NVML_DEVICE_ARCH_BLACKWELL = 10 +NVML_DEVICE_ARCH_T23X = 11 +NVML_DEVICE_ARCH_UNKNOWN = 0xffffffff + +# PCI bus Types +_nvmlBusType_t = c_uint +NVML_BUS_TYPE_UNKNOWN = 0 +NVML_BUS_TYPE_PCI = 1 +NVML_BUS_TYPE_PCIE = 2 +NVML_BUS_TYPE_FPCI = 3 +NVML_BUS_TYPE_AGP = 4 + +_nvmlPowerSource_t = c_uint +NVML_POWER_SOURCE_AC = 0x00000000 +NVML_POWER_SOURCE_BATTERY = 0x00000001 +NVML_POWER_SOURCE_UNDERSIZED = 0x00000002 + +_nvmlAdaptiveClockInfoStatus_t = c_uint +NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED = 0x00000000 +NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED = 0x00000001 + +_nvmlClockLimitId_t = c_uint +NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00 +NVML_CLOCK_LIMIT_ID_TDP = 0xffffff01 +NVML_CLOCK_LIMIT_ID_UNLIMITED = 0xffffff02 + +_nvmlPcieLinkMaxSpeed_t = c_uint +NVML_PCIE_LINK_MAX_SPEED_INVALID = 0x00000000 +NVML_PCIE_LINK_MAX_SPEED_2500MBPS = 0x00000001 +NVML_PCIE_LINK_MAX_SPEED_5000MBPS = 0x00000002 +NVML_PCIE_LINK_MAX_SPEED_8000MBPS = 0x00000003 +NVML_PCIE_LINK_MAX_SPEED_16000MBPS = 0x00000004 +NVML_PCIE_LINK_MAX_SPEED_32000MBPS = 0x00000005 +NVML_PCIE_LINK_MAX_SPEED_64000MBPS = 0x00000006 + +_nvmlPcieAtomicsCapability_t = c_uint +NVML_PCIE_ATOMICS_CAP_FETCHADD32 = 0x01 
+NVML_PCIE_ATOMICS_CAP_FETCHADD64 = 0x02 +NVML_PCIE_ATOMICS_CAP_SWAP32 = 0x04 +NVML_PCIE_ATOMICS_CAP_SWAP64 = 0x08 +NVML_PCIE_ATOMICS_CAP_CAS32 = 0x10 +NVML_PCIE_ATOMICS_CAP_CAS64 = 0x20 +NVML_PCIE_ATOMICS_CAP_CAS128 = 0x40 +NVML_PCIE_ATOMICS_OPS_MAX = 7 + +_nvmlAffinityScope_t = c_uint +NVML_AFFINITY_SCOPE_NODE = 0 +NVML_AFFINITY_SCOPE_SOCKET = 1 + +_nvmlDeviceGpuRecoveryAction_t = c_uint +NVML_GPU_RECOVERY_ACTION_NONE = 0 +NVML_GPU_RECOVERY_ACTION_GPU_RESET = 1 +NVML_GPU_RECOVERY_ACTION_NODE_REBOOT = 2 +NVML_GPU_RECOVERY_ACTION_DRAIN_P2P = 3 +NVML_GPU_RECOVERY_ACTION_DRAIN_AND_RESET = 4 + +# C preprocessor defined values +nvmlFlagDefault = 0 +nvmlFlagForce = 1 +NVML_INIT_FLAG_NO_GPUS = 1 +NVML_INIT_FLAG_NO_ATTACH = 2 + +NVML_MAX_GPC_COUNT = 32 + +# buffer size +NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE = 16 +NVML_DEVICE_UUID_BUFFER_SIZE = 80 +NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96 +NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 80 +NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 +NVML_DEVICE_NAME_BUFFER_SIZE = 64 +NVML_DEVICE_NAME_V2_BUFFER_SIZE = 96 +NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 +NVML_DEVICE_PART_NUMBER_BUFFER_SIZE = 80 +NVML_DEVICE_GPU_PART_NUMBER_BUFFER_SIZE = 80 +NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 +NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE = 32 +NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE = 16 +NVML_GRID_LICENSE_BUFFER_SIZE = 128 +NVML_VGPU_NAME_BUFFER_SIZE = 64 +NVML_GRID_LICENSE_FEATURE_MAX_COUNT = 3 +NVML_VGPU_METADATA_OPAQUE_DATA_SIZE = sizeof(c_uint) + 256 +NVML_VGPU_PGPU_METADATA_OPAQUE_DATA_SIZE = 256 +NVML_DEVICE_GPU_FRU_PART_NUMBER_BUFFER_SIZE = 0x14 # NV2080_GPU_MAX_PRODUCT_PART_NUMBER_LENGTH +NVML_PERF_MODES_BUFFER_SIZE = 2048 + +# Format strings +NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT = "%04X:%02X:%02X.0" +NVML_DEVICE_PCI_BUS_ID_FMT = "%08X:%02X:%02X.0" + +NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1) +NVML_VALUE_NOT_AVAILABLE_uint = c_uint(-1) + +''' + Field Identifiers. + + All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. +''' +NVML_FI_DEV_ECC_CURRENT = 1 # Current ECC mode. 1=Active. 0=Inactive +NVML_FI_DEV_ECC_PENDING = 2 # Pending ECC mode. 1=Active. 
0=Inactive + +#ECC Count Totals +NVML_FI_DEV_ECC_SBE_VOL_TOTAL = 3 # Total single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_TOTAL = 4 # Total double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_AGG_TOTAL = 5 # Total single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_TOTAL = 6 # Total double bit aggregate (persistent) ECC errors +#Individual ECC locations +NVML_FI_DEV_ECC_SBE_VOL_L1 = 7 # L1 cache single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_L1 = 8 # L1 cache double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_L2 = 9 # L2 cache single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_L2 = 10 # L2 cache double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_DEV = 11 # Device memory single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_DEV = 12 # Device memory double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_REG = 13 # Register file single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_REG = 14 # Register file double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_TEX = 15 # Texture memory single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_TEX = 16 # Texture memory double bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_CBU = 17 # CBU double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_AGG_L1 = 18 # L1 cache single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_L1 = 19 # L1 cache double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_L2 = 20 # L2 cache single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_L2 = 21 # L2 cache double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_DEV = 22 # Device memory single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_DEV = 23 # Device memory double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_REG = 24 # Register File single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_REG = 25 # Register File double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_TEX = 26 # Texture memory single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_TEX = 27 # Texture memory double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_CBU = 28 # CBU double bit aggregate ECC errors + +# Page Retirement +NVML_FI_DEV_RETIRED_SBE = 29 # Number of retired pages because of single bit errors +NVML_FI_DEV_RETIRED_DBE = 30 # Number of retired pages because of double bit errors +NVML_FI_DEV_RETIRED_PENDING = 31 # If any pages are pending retirement. 1=yes. 0=no. 
+ +# NvLink Flit Error Counters +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 32 # NVLink flow control CRC Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 33 # NVLink flow control CRC Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 34 # NVLink flow control CRC Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 35 # NVLink flow control CRC Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 36 # NVLink flow control CRC Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 37 # NVLink flow control CRC Error Counter for Lane 5 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 38 # NVLink flow control CRC Error Counter total for all Lanes + +# NvLink CRC Data Error Counters +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 39 # NVLink data CRC Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 40 # NVLink data CRC Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 41 # NVLink data CRC Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 42 # NVLink data CRC Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 43 # NVLink data CRC Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 44 # NVLink data CRC Error Counter for Lane 5 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 45 # NvLink data CRC Error Counter total for all Lanes + +# NvLink Replay Error Counters +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 46 # NVLink Replay Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 47 # NVLink Replay Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 48 # NVLink Replay Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 49 # NVLink Replay Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 50 # NVLink Replay Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 51 # NVLink Replay Error Counter for Lane 5 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 52 # NVLink Replay Error Counter total for all Lanes + +# NvLink Recovery Error Counters +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 53 # NVLink Recovery Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 54 # NVLink Recovery Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 55 # NVLink Recovery Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 56 # NVLink Recovery Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 57 # NVLink Recovery Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 58 # NVLink Recovery Error Counter for Lane 5 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 59 # NVLink Recovery Error Counter total for all Lanes + +# NvLink Bandwidth Counters +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 = 60 # NVLink Bandwidth Counter for Counter Set 0, Lane 0 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 = 61 # NVLink Bandwidth Counter for Counter Set 0, Lane 1 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 = 62 # NVLink Bandwidth Counter for Counter Set 0, Lane 2 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 = 63 # NVLink Bandwidth Counter for Counter Set 0, Lane 3 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 = 64 # NVLink Bandwidth Counter for Counter Set 0, Lane 4 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 = 65 # NVLink Bandwidth Counter for Counter Set 0, Lane 5 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL = 66 # NVLink Bandwidth Counter Total for Counter Set 0, All Lanes + +# NvLink Bandwidth Counters 
+NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 = 67 # NVLink Bandwidth Counter for Counter Set 1, Lane 0 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 = 68 # NVLink Bandwidth Counter for Counter Set 1, Lane 1 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 = 69 # NVLink Bandwidth Counter for Counter Set 1, Lane 2 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 = 70 # NVLink Bandwidth Counter for Counter Set 1, Lane 3 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 = 71 # NVLink Bandwidth Counter for Counter Set 1, Lane 4 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 = 72 # NVLink Bandwidth Counter for Counter Set 1, Lane 5 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL = 73 # NVLink Bandwidth Counter Total for Counter Set 1, All Lanes + +# Perf Policy Counters +NVML_FI_DEV_PERF_POLICY_POWER = 74 # Perf Policy Counter for Power Policy +NVML_FI_DEV_PERF_POLICY_THERMAL = 75 # Perf Policy Counter for Thermal Policy +NVML_FI_DEV_PERF_POLICY_SYNC_BOOST = 76 # Perf Policy Counter for Sync boost Policy +NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT = 77 # Perf Policy Counter for Board Limit +NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION = 78 # Perf Policy Counter for Low GPU Utilization Policy +NVML_FI_DEV_PERF_POLICY_RELIABILITY = 79 # Perf Policy Counter for Reliability Policy +NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS = 80 # Perf Policy Counter for Total App Clock Policy +NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS = 81 # Perf Policy Counter for Total Base Clocks Policy + +# Memory temperatures +NVML_FI_DEV_MEMORY_TEMP = 82 # Memory temperature for the device + +# Energy Counter +NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 83 # Total energy consumption for the GPU in mJ since the driver was last reloaded + +# NVLink Speed +NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 = 84 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 = 85 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 = 86 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 = 87 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 = 88 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 = 89 +NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON = 90 + +# NVLink Link Count +NVML_FI_DEV_NVLINK_LINK_COUNT = 91 + +# Page Retirement pending fields +NVML_FI_DEV_RETIRED_PENDING_SBE = 92 +NVML_FI_DEV_RETIRED_PENDING_DBE = 93 + +# PCIe replay and replay rollover counters +NVML_FI_DEV_PCIE_REPLAY_COUNTER = 94 +NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER = 95 + +# NvLink Flit Error Counters +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 96 # NVLink flow control CRC Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 97 # NVLink flow control CRC Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 98 # NVLink flow control CRC Error Counter for Lane 8 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 99 # NVLink flow control CRC Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 100 # NVLink flow control CRC Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 101 # NVLink flow control CRC Error Counter for Lane 11 + +# NvLink CRC Data Error Counters +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 102 # NVLink data CRC Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 103 # NVLink data CRC Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 104 # NVLink data CRC Error Counter for Lane 8 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 105 # NVLink data CRC Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 106 # NVLink data CRC Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 107 # NVLink data CRC Error Counter for Lane 11 + +# NvLink Replay Error Counters +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 108 # NVLink 
Replay Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 109 # NVLink Replay Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 110 # NVLink Replay Error Counter for Lane 8 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 111 # NVLink Replay Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 112 # NVLink Replay Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 113 # NVLink Replay Error Counter for Lane 11 + +# NvLink Recovery Error Counters +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 114 # NVLink Recovery Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 115 # NVLink Recovery Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 116 # NVLink Recovery Error Counter for Lane 8 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 117 # NVLink Recovery Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 118 # NVLink Recovery Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 119 # NVLink Recovery Error Counter for Lane 11 + +# NvLink Bandwidth Counters +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 = 120 # NVLink Bandwidth Counter for Counter Set 0, Lane 6 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 = 121 # NVLink Bandwidth Counter for Counter Set 0, Lane 7 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 = 122 # NVLink Bandwidth Counter for Counter Set 0, Lane 8 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 = 123 # NVLink Bandwidth Counter for Counter Set 0, Lane 9 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 = 124 # NVLink Bandwidth Counter for Counter Set 0, Lane 10 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 = 125 # NVLink Bandwidth Counter for Counter Set 0, Lane 11 + +# NvLink Bandwidth Counters +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 = 126 # NVLink Bandwidth Counter for Counter Set 1, Lane 6 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 = 127 # NVLink Bandwidth Counter for Counter Set 1, Lane 7 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 = 128 # NVLink Bandwidth Counter for Counter Set 1, Lane 8 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 = 129 # NVLink Bandwidth Counter for Counter Set 1, Lane 9 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 = 130 # NVLink Bandwidth Counter for Counter Set 1, Lane 10 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 = 131 # NVLink Bandwidth Counter for Counter Set 1, Lane 11 + +# NVLink Speed +NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 = 132 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 = 133 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 = 134 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 = 135 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 = 136 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 = 137 + +# NVLink Throughput Counters +NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX = 138 # NVLink TX Data throughput in KiB +NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX = 139 # NVLink RX Data throughput in KiB +NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX = 140 # NVLink TX Data + protocol overhead in KiB +NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX = 141 # NVLink RX Data + protocol overhead in KiB + +# Row Remapper +NVML_FI_DEV_REMAPPED_COR = 142 +NVML_FI_DEV_REMAPPED_UNC = 143 +NVML_FI_DEV_REMAPPED_PENDING = 144 +NVML_FI_DEV_REMAPPED_FAILURE = 145 + +#Remote device NVLink ID +NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID = 146 + +# Number of NVLinks connected to NVSwitch +NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT = 147 + +# NvLink ECC Data Error Counters +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 = 148 #< NVLink data ECC Error Counter for Link 0 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 = 149 #< NVLink data ECC Error Counter for Link 1 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 = 150 #< NVLink data ECC Error Counter for Link 2 
+NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 = 151 #< NVLink data ECC Error Counter for Link 3 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 = 152 #< NVLink data ECC Error Counter for Link 4 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 = 153 #< NVLink data ECC Error Counter for Link 5 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 = 154 #< NVLink data ECC Error Counter for Link 6 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 = 155 #< NVLink data ECC Error Counter for Link 7 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 = 156 #< NVLink data ECC Error Counter for Link 8 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 = 157 #< NVLink data ECC Error Counter for Link 9 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 = 158 #< NVLink data ECC Error Counter for Link 10 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 = 159 #< NVLink data ECC Error Counter for Link 11 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL = 160 #< NvLink data ECC Error Counter total for all Links + +NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY = 161 +NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY = 162 +NVML_FI_DEV_NVLINK_ERROR_DL_CRC = 163 +NVML_FI_DEV_NVLINK_GET_SPEED = 164 +NVML_FI_DEV_NVLINK_GET_STATE = 165 +NVML_FI_DEV_NVLINK_GET_VERSION = 166 + +NVML_FI_DEV_NVLINK_GET_POWER_STATE = 167 +NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD = 168 + +NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER = 169 + +NVML_FI_DEV_C2C_LINK_COUNT = 170 +NVML_FI_DEV_C2C_LINK_GET_STATUS = 171 +NVML_FI_DEV_C2C_LINK_GET_MAX_BW = 172 + +NVML_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS = 173 +NVML_FI_DEV_PCIE_COUNT_NAKS_RECEIVED = 174 +NVML_FI_DEV_PCIE_COUNT_RECEIVER_ERROR = 175 +NVML_FI_DEV_PCIE_COUNT_BAD_TLP = 176 +NVML_FI_DEV_PCIE_COUNT_NAKS_SENT = 177 +NVML_FI_DEV_PCIE_COUNT_BAD_DLLP = 178 +NVML_FI_DEV_PCIE_COUNT_NON_FATAL_ERROR = 179 +NVML_FI_DEV_PCIE_COUNT_FATAL_ERROR = 180 +NVML_FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ = 181 +NVML_FI_DEV_PCIE_COUNT_LCRC_ERROR = 182 +NVML_FI_DEV_PCIE_COUNT_LANE_ERROR = 183 + +NVML_FI_DEV_IS_RESETLESS_MIG_SUPPORTED = 184 + +NVML_FI_DEV_POWER_AVERAGE = 185 +NVML_FI_DEV_POWER_INSTANT = 186 +NVML_FI_DEV_POWER_MIN_LIMIT = 187 +NVML_FI_DEV_POWER_MAX_LIMIT = 188 +NVML_FI_DEV_POWER_DEFAULT_LIMIT = 189 +NVML_FI_DEV_POWER_CURRENT_LIMIT = 190 +NVML_FI_DEV_ENERGY = 191 +NVML_FI_DEV_POWER_REQUESTED_LIMIT = 192 + +NVML_FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT = 193 +NVML_FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT = 194 +NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT = 195 +NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT = 196 + +NVML_FI_DEV_PCIE_COUNT_TX_BYTES = 197 +NVML_FI_DEV_PCIE_COUNT_RX_BYTES = 198 + +NVML_FI_DEV_IS_MIG_MODE_INDEPENDENT_MIG_QUERY_CAPABLE = 199 + +NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MAX = 200 + +NVML_FI_DEV_NVLINK_COUNT_XMIT_PACKETS = 201 +NVML_FI_DEV_NVLINK_COUNT_XMIT_BYTES = 202 +NVML_FI_DEV_NVLINK_COUNT_RCV_PACKETS = 203 +NVML_FI_DEV_NVLINK_COUNT_RCV_BYTES = 204 +NVML_FI_DEV_NVLINK_COUNT_VL15_DROPPED = 205 # Deprecated, do not use +NVML_FI_DEV_NVLINK_COUNT_MALFORMED_PACKET_ERRORS = 206 +NVML_FI_DEV_NVLINK_COUNT_BUFFER_OVERRUN_ERRORS = 207 +NVML_FI_DEV_NVLINK_COUNT_RCV_ERRORS = 208 +NVML_FI_DEV_NVLINK_COUNT_RCV_REMOTE_ERRORS = 209 +NVML_FI_DEV_NVLINK_COUNT_RCV_GENERAL_ERRORS = 210 +NVML_FI_DEV_NVLINK_COUNT_LOCAL_LINK_INTEGRITY_ERRORS = 211 +NVML_FI_DEV_NVLINK_COUNT_XMIT_DISCARDS = 212 + +NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_SUCCESSFUL_EVENTS = 213 +NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_FAILED_EVENTS = 214 +NVML_FI_DEV_NVLINK_COUNT_LINK_RECOVERY_EVENTS = 215 + +NVML_FI_DEV_NVLINK_COUNT_RAW_BER_LANE0 = 216 # Deprecated, do not use +NVML_FI_DEV_NVLINK_COUNT_RAW_BER_LANE1 = 217 # Deprecated, do 
not use +NVML_FI_DEV_NVLINK_COUNT_RAW_BER = 218 # Deprecated, do not use +NVML_FI_DEV_NVLINK_COUNT_EFFECTIVE_ERRORS = 219 +NVML_FI_DEV_NVLINK_COUNT_EFFECTIVE_BER = 220 +NVML_FI_DEV_NVLINK_COUNT_SYMBOL_ERRORS = 221 +NVML_FI_DEV_NVLINK_COUNT_SYMBOL_BER = 222 + +NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_MIN = 223 +NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS = 224 # Values are in the form NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_* +NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_SUPPORTED = 225 + +NVML_FI_DEV_RESET_STATUS = 226 # Deprecated use NVML_FI_DEV_GET_GPU_RECOVERY_ACTION instead +NVML_FI_DEV_DRAIN_AND_RESET_STATUS = 227 # Deprecated use NVML_FI_DEV_GET_GPU_RECOVERY_ACTION instead +NVML_FI_DEV_PCIE_OUTBOUND_ATOMICS_MASK = 228 +NVML_FI_DEV_PCIE_INBOUND_ATOMICS_MASK = 229 +NVML_FI_DEV_GET_GPU_RECOVERY_ACTION = 230 + +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_0 = 235 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_1 = 236 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_2 = 237 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_3 = 238 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_4 = 239 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_5 = 240 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_6 = 241 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_7 = 242 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_8 = 243 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_9 = 244 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_10 = 245 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_11 = 246 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_12 = 247 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_13 = 248 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_14 = 249 +NVML_FI_DEV_NVLINK_COUNT_FEC_HISTORY_15 = 250 +NVML_FI_PWR_SMOOTHING_ENABLED = 251 # Enablement (0/DISABLED or 1/ENABLED) +NVML_FI_PWR_SMOOTHING_PRIV_LVL = 252 # Current privilege level +NVML_FI_PWR_SMOOTHING_IMM_RAMP_DOWN_ENABLED = 253 # Immediate ramp down enablement (0/DISABLED or 1/ENABLED) +NVML_FI_PWR_SMOOTHING_APPLIED_TMP_CEIL = 254 # Applied TMP ceiling value +NVML_FI_PWR_SMOOTHING_APPLIED_TMP_FLOOR = 255 # Applied TMP floor value +NVML_FI_PWR_SMOOTHING_MAX_PERCENT_TMP_FLOOR_SETTING = 256 # Max % TMP Floor value +NVML_FI_PWR_SMOOTHING_MIN_PERCENT_TMP_FLOOR_SETTING = 257 # Min % TMP Floor value +NVML_FI_PWR_SMOOTHING_HW_CIRCUITRY_PERCENT_LIFETIME_REMAINING = 258 # HW Circuitry % lifetime remaining +NVML_FI_PWR_SMOOTHING_MAX_NUM_PRESET_PROFILES = 259 # Max number of preset profiles +NVML_FI_PWR_SMOOTHING_PROFILE_PERCENT_TMP_FLOOR = 260 # % TMP floor for a given profile +NVML_FI_PWR_SMOOTHING_PROFILE_RAMP_UP_RATE = 261 # Ramp up rate in mW/s for a given profile +NVML_FI_PWR_SMOOTHING_PROFILE_RAMP_DOWN_RATE = 262 # Ramp down rate in mW/s for a given profile +NVML_FI_PWR_SMOOTHING_PROFILE_RAMP_DOWN_HYST_VAL = 263 # Ramp down hysteresis value in ms for a given profile +NVML_FI_PWR_SMOOTHING_ACTIVE_PRESET_PROFILE = 264 # Active preset profile number +NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_PERCENT_TMP_FLOOR = 265 # % TMP floor for a given profile +NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_UP_RATE = 266 # Ramp up rate in mW/s for a given profile +NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_RATE = 267 # Ramp down rate in mW/s for a given profile +NVML_FI_PWR_SMOOTHING_ADMIN_OVERRIDE_RAMP_DOWN_HYST_VAL = 268 # Ramp down hysteresis value in ms for a given profile + +NVML_FI_MAX = 269 # One greater than the largest field ID defined above + +# NVML_FI_DEV_NVLINK_GET_STATE state enums +NVML_NVLINK_STATE_INACTIVE = 0x0 +NVML_NVLINK_STATE_ACTIVE = 0x1 +NVML_NVLINK_STATE_SLEEP = 0x2 + +NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_100US = 0 # NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS 
+NVML_NVLINK_LOW_POWER_THRESHOLD_UNIT_50US = 1 # NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD_UNITS + +## Enums needed for the method nvmlDeviceGetVirtualizationMode and nvmlDeviceSetVirtualizationMode +NVML_GPU_VIRTUALIZATION_MODE_NONE = 0 # Represents Bare Metal GPU +NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1 # Device is associated with GPU-Passthorugh +NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2 # Device is associated with vGPU inside virtual machine. +NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3 # Device is associated with VGX hypervisor in vGPU mode +NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4 # Device is associated with VGX hypervisor in vSGA mode + +## Lib loading ## +nvmlLib = None +libLoadLock = threading.Lock() +_nvmlLib_refcount = 0 # Incremented on each nvmlInit and decremented on nvmlShutdown + +## vGPU Management +_nvmlVgpuTypeId_t = c_uint +_nvmlVgpuInstance_t = c_uint + +_nvmlVgpuVmIdType_t = c_uint +NVML_VGPU_VM_ID_DOMAIN_ID = 0 +NVML_VGPU_VM_ID_UUID = 1 + +_nvmlGridLicenseFeatureCode_t = c_uint +NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0 +NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1 +NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2 +NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 # deprecated, use NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX. +NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3 +NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 + +_nvmlGridLicenseExpiryStatus_t = c_uint8 +NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0, # Expiry information not available +NVML_GRID_LICENSE_EXPIRY_INVALID = 1, # Invalid expiry or error fetching expiry +NVML_GRID_LICENSE_EXPIRY_VALID = 2, # Valid expiry +NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE = 3, # Expiry not applicable +NVML_GRID_LICENSE_EXPIRY_PERMANENT = 4, # Permanent expiry + +_nvmlVgpuCapability_t = c_uint +NVML_VGPU_CAP_NVLINK_P2P = 0 # vGPU P2P over NVLink is supported +NVML_VGPU_CAP_GPUDIRECT = 1 # GPUDirect capability is supported +NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE = 2 # vGPU profile cannot be mixed with other vGPU profiles in same VM +NVML_VGPU_CAP_EXCLUSIVE_TYPE = 3 # vGPU profile cannot run on a GPU alongside other profiles of different type +NVML_VGPU_CAP_EXCLUSIVE_SIZE = 4 # vGPU profile cannot run on a GPU alongside other profiles of different size +NVML_VGPU_CAP_COUNT = 5 + +_nvmlVgpuDriverCapability_t = c_uint +NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0 # Supports mixing of different vGPU profiles within one guest VM +NVML_VGPU_DRIVER_CAP_WARM_UPDATE = 1 # Supports FSR and warm update of vGPU host driver without terminating the running guest VM +NVML_VGPU_DRIVER_CAP_COUNT = 2 + +_nvmlDeviceVgpuCapability_t = c_uint +NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0 # Query whether the fractional vGPU profiles on this GPU can be used in multi-vGPU configurations +NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1 # Query whether the GPU supports concurrent execution of timesliced vGPU profiles of differing types +NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2 # Query whether the GPU supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes +NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW = 3 # Query the GPU's read_device_buffer expected bandwidth capacity in megabytes per second +NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW = 4 # Query the GPU's write_device_buffer expected bandwidth capacity in megabytes per second +NVML_DEVICE_VGPU_CAP_DEVICE_STREAMING = 5 # Query whether the vGPU profiles on the GPU supports migration data streaming +NVML_DEVICE_VGPU_CAP_MINI_QUARTER_GPU = 6 # Set/Get support of 
mini-quarter vGPU profiles +NVML_DEVICE_VGPU_CAP_COMPUTE_MEDIA_ENGINE_GPU = 7 # Set/Get support for compute media engine vGPU profiles +NVML_DEVICE_VGPU_CAP_WARM_UPDATE = 8 # Query whether the GPU supports FSR and warm update +NVML_DEVICE_VGPU_CAP_HOMOGENEOUS_PLACEMENTS = 9 # Query whether the GPU supports reporting of placements of timesliced vGPU profiles with identical framebuffer sizes +NVML_DEVICE_VGPU_CAP_COUNT = 10 + +_nvmlVgpuGuestInfoState_t = c_uint +NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0 +NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1 + +_nvmlVgpuVmCompatibility_t = c_uint +NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0 +NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1 +NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2 +NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4 +NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8 + +_nvmlVgpuPgpuCompatibilityLimitCode_t = c_uint +NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0 +NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1 +NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2 +NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4 +NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000 + +_nvmlHostVgpuMode_t = c_uint +NVML_HOST_VGPU_MODE_NON_SRIOV = 0 +NVML_HOST_VGPU_MODE_SRIOV = 1 + +_nvmlConfComputeGpusReadyState_t = c_uint +NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE = 0 +NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE = 1 + +_nvmlConfComputeGpuCaps_t = c_uint +NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE = 0 +NVML_CC_SYSTEM_GPUS_CC_CAPABLE = 1 + +_nvmlConfComputeCpuCaps_t = c_uint +NVML_CC_SYSTEM_CPU_CAPS_NONE = 0 +NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV = 1 +NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX = 2 +NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV_SNP = 3 +NVML_CC_SYSTEM_CPU_CAPS_AMD_SNP_VTOM = 4 + +_nvmlConfComputeDevToolsMode_t = c_uint +NVML_CC_SYSTEM_DEVTOOLS_MODE_OFF = 0 +NVML_CC_SYSTEM_DEVTOOLS_MODE_ON = 1 + +NVML_CC_SYSTEM_MULTIGPU_NONE = 0 +NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE = 1 + +NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE = 0 +NVML_CC_SYSTEM_ENVIRONMENT_SIM = 1 +NVML_CC_SYSTEM_ENVIRONMENT_PROD = 2 + +_nvmlConfComputeCcFeature_t = c_uint +NVML_CC_SYSTEM_FEATURE_DISABLED = 0 +NVML_CC_SYSTEM_FEATURE_ENABLED = 1 + +_nvmlConfComputeCcKeyRotationThreshAttackerAdv_t = c_uint +NVML_CC_KEY_ROTATION_THRESH_ATTACKER_ADVANTAGE_MIN = 50 +NVML_CC_KEY_ROTATION_THRESH_ATTACKER_ADVANTAGE_MAX = 65 + +# GSP firmware +NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40 + +class NVMLLibraryMismatchError(Exception): + pass + +## Error Checking ## +class NVMLError(Exception): + _valClassMapping = dict() + # List of currently known error codes + _errcode_to_string = { + NVML_ERROR_UNINITIALIZED: "Uninitialized", + NVML_ERROR_INVALID_ARGUMENT: "Invalid Argument", + NVML_ERROR_NOT_SUPPORTED: "Not Supported", + NVML_ERROR_NO_PERMISSION: "Insufficient Permissions", + NVML_ERROR_ALREADY_INITIALIZED: "Already Initialized", + NVML_ERROR_NOT_FOUND: "Not Found", + NVML_ERROR_INSUFFICIENT_SIZE: "Insufficient Size", + NVML_ERROR_INSUFFICIENT_POWER: "Insufficient External Power", + NVML_ERROR_DRIVER_NOT_LOADED: "Driver Not Loaded", + NVML_ERROR_TIMEOUT: "Timeout", + NVML_ERROR_IRQ_ISSUE: "Interrupt Request Issue", + NVML_ERROR_LIBRARY_NOT_FOUND: "NVML Shared Library Not Found", + NVML_ERROR_FUNCTION_NOT_FOUND: "Function Not Found", + NVML_ERROR_CORRUPTED_INFOROM: "Corrupted infoROM", + NVML_ERROR_GPU_IS_LOST: "GPU is lost", + NVML_ERROR_RESET_REQUIRED: "GPU requires restart", + NVML_ERROR_OPERATING_SYSTEM: "The operating system has blocked the request.", + NVML_ERROR_LIB_RM_VERSION_MISMATCH: "RM has detected an NVML/RM version mismatch.", + NVML_ERROR_MEMORY: "Insufficient 
Memory", + NVML_ERROR_UNKNOWN: "Unknown Error", + } + def __new__(typ, value): + ''' + Maps value to a proper subclass of NVMLError. + See _extractNVMLErrorsAsClasses function for more details + ''' + if typ == NVMLError: + typ = NVMLError._valClassMapping.get(value, typ) + obj = Exception.__new__(typ) + obj.value = value + return obj + def __str__(self): + try: + if self.value not in NVMLError._errcode_to_string: + NVMLError._errcode_to_string[self.value] = str(nvmlErrorString(self.value)) + return NVMLError._errcode_to_string[self.value] + except NVMLError: + return "NVML Error with code %d" % self.value + def __eq__(self, other): + return self.value == other.value + +def nvmlExceptionClass(nvmlErrorCode): + if nvmlErrorCode not in NVMLError._valClassMapping: + raise ValueError('nvmlErrorCode %s is not valid' % nvmlErrorCode) + return NVMLError._valClassMapping[nvmlErrorCode] + +def _extractNVMLErrorsAsClasses(): + ''' + Generates a hierarchy of classes on top of NVMLError class. + + Each NVML Error gets a new NVMLError subclass. This way try,except blocks can filter appropriate + exceptions more easily. + + NVMLError is a parent class. Each NVML_ERROR_* gets it's own subclass. + e.g. NVML_ERROR_ALREADY_INITIALIZED will be turned into NVMLError_AlreadyInitialized + ''' + this_module = sys.modules[__name__] + nvmlErrorsNames = [x for x in dir(this_module) if x.startswith("NVML_ERROR_")] + for err_name in nvmlErrorsNames: + # e.g. Turn NVML_ERROR_ALREADY_INITIALIZED into NVMLError_AlreadyInitialized + class_name = "NVMLError_" + string.capwords(err_name.replace("NVML_ERROR_", ""), "_").replace("_", "") + err_val = getattr(this_module, err_name) + def gen_new(val): + def new(typ): + obj = NVMLError.__new__(typ, val) + return obj + return new + new_error_class = type(class_name, (NVMLError,), {'__new__': gen_new(err_val)}) + new_error_class.__module__ = __name__ + setattr(this_module, class_name, new_error_class) + NVMLError._valClassMapping[err_val] = new_error_class +_extractNVMLErrorsAsClasses() + +def _nvmlCheckReturn(ret): + if (ret != NVML_SUCCESS): + raise NVMLError(ret) + return ret + +## Function access ## +_nvmlGetFunctionPointer_cache = dict() # function pointers are cached to prevent unnecessary libLoadLock locking +def _nvmlGetFunctionPointer(name): + global nvmlLib + + if name in _nvmlGetFunctionPointer_cache: + return _nvmlGetFunctionPointer_cache[name] + + libLoadLock.acquire() + try: + # ensure library was loaded + if (nvmlLib == None): + raise NVMLError(NVML_ERROR_UNINITIALIZED) + try: + _nvmlGetFunctionPointer_cache[name] = getattr(nvmlLib, name) + return _nvmlGetFunctionPointer_cache[name] + except AttributeError: + raise NVMLError(NVML_ERROR_FUNCTION_NOT_FOUND) + finally: + # lock is always freed + libLoadLock.release() + +## Alternative object +# Allows the object to be printed +# Allows mismatched types to be assigned +# - like None when the Structure variant requires c_uint +class nvmlFriendlyObject(object): + def __init__(self, dictionary): + for x in dictionary: + setattr(self, x, dictionary[x]) + def __str__(self): + return self.__dict__.__str__() + +def nvmlStructToFriendlyObject(struct): + d = {} + for x in struct._fields_: + key = x[0] + value = getattr(struct, key) + # only need to convert from bytes if bytes, no need to check python version. 
+ d[key] = value.decode() if isinstance(value, bytes) else value + obj = nvmlFriendlyObject(d) + return obj + +# pack the object so it can be passed to the NVML library +def nvmlFriendlyObjectToStruct(obj, model): + for x in model._fields_: + key = x[0] + value = obj.__dict__[key] + # any c_char_p in python3 needs to be bytes, default encoding works fine. + if sys.version_info >= (3,): + setattr(model, key, value.encode()) + else: + setattr(model, key, value) + return model + +## Unit structures +class struct_c_nvmlUnit_t(Structure): + pass # opaque handle +c_nvmlUnit_t = POINTER(struct_c_nvmlUnit_t) + +class _PrintableStructure(Structure): + """ + Abstract class that produces nicer __str__ output than ctypes.Structure. + e.g. instead of: + >>> print str(obj) + + this class will print + class_name(field_name: formatted_value, field_name: formatted_value) + + _fmt_ dictionary of -> + e.g. class that has _field_ 'hex_value', c_uint could be formatted with + _fmt_ = {"hex_value" : "%08X"} + to produce nicer output. + Default fomratting string for all fields can be set with key "" like: + _fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz. + If not set it's assumed to be just "%s" + + Exact format of returned str from this class is subject to change in the future. + """ + _fmt_ = {} + def __str__(self): + result = [] + for x in self._fields_: + key = x[0] + value = getattr(self, key) + fmt = "%s" + if key in self._fmt_: + fmt = self._fmt_[key] + elif "" in self._fmt_: + fmt = self._fmt_[""] + result.append(("%s: " + fmt) % (key, value)) + return self.__class__.__name__ + "(" + ", ".join(result) + ")" + + def __getattribute__(self, name): + res = super(_PrintableStructure, self).__getattribute__(name) + # need to convert bytes to unicode for python3 don't need to for python2 + # Python 2 strings are of both str and bytes + # Python 3 strings are not of type bytes + # ctypes should convert everything to the correct values otherwise + if isinstance(res, bytes): + if isinstance(res, str): + return res + return res.decode() + return res + + def __setattr__(self, name, value): + if isinstance(value, str): + # encoding a python2 string returns the same value, since python2 strings are bytes already + # bytes passed in python3 will be ignored. 
+ value = value.encode() + super(_PrintableStructure, self).__setattr__(name, value) + +class c_nvmlUnitInfo_t(_PrintableStructure): + _fields_ = [ + ('name', c_char * 96), + ('id', c_char * 96), + ('serial', c_char * 96), + ('firmwareVersion', c_char * 96), + ] + +class c_nvmlC2cModeInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('isC2cEnabled', c_uint) + ] + +nvmlC2cModeInfo_v1 = 0x1000008; + +class c_nvmlLedState_t(_PrintableStructure): + _fields_ = [ + ('cause', c_char * 256), + ('color', _nvmlLedColor_t), + ] + +class c_nvmlPSUInfo_t(_PrintableStructure): + _fields_ = [ + ('state', c_char * 256), + ('current', c_uint), + ('voltage', c_uint), + ('power', c_uint), + ] + +class c_nvmlUnitFanInfo_t(_PrintableStructure): + _fields_ = [ + ('speed', c_uint), + ('state', _nvmlFanState_t), + ] + +class c_nvmlUnitFanSpeeds_t(_PrintableStructure): + _fields_ = [ + ('fans', c_nvmlUnitFanInfo_t * 24), + ('count', c_uint) + ] + +## Device structures +class struct_c_nvmlDevice_t(Structure): + pass # opaque handle +c_nvmlDevice_t = POINTER(struct_c_nvmlDevice_t) + +class nvmlPciInfoExt_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('domain', c_uint), + ('bus', c_uint), + ('device', c_uint), + ('pciDeviceId', c_uint), + ('pciSubSystemId', c_uint), + ('baseClass', c_uint), + ('subClass', c_uint), + ('busId', c_char * NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE), + ] + _fmt_ = { + 'version' : "0x%04X", + 'domain' : "0x%04X", + 'bus' : "0x%02X", + 'device' : "0x%02X", + 'pciDeviceId' : "0x%08X", + 'pciSubSystemId' : "0x%08X", + 'baseClass' : "0x%01X", + 'subClass' : "0x%01X", + } + +nvmlPciInfoExt_v1 = 0x1000040 + +# Legacy pciInfo used for _v1 and _v2 +class nvmlPciInfo_v2_t(_PrintableStructure): + _fields_ = [ + ('busId', c_char * NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE), + ('domain', c_uint), + ('bus', c_uint), + ('device', c_uint), + ('pciDeviceId', c_uint), + + # Added in 2.285 + ('pciSubSystemId', c_uint), + ('reserved0', c_uint), + ('reserved1', c_uint), + ('reserved2', c_uint), + ('reserved3', c_uint), + ] + _fmt_ = { + 'domain' : "0x%04X", + 'bus' : "0x%02X", + 'device' : "0x%02X", + 'pciDeviceId' : "0x%08X", + 'pciSubSystemId' : "0x%08X", + } + +class nvmlPciInfo_t(_PrintableStructure): + _fields_ = [ + # Moved to the new busId location below + ('busIdLegacy', c_char * NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE), + ('domain', c_uint), + ('bus', c_uint), + ('device', c_uint), + ('pciDeviceId', c_uint), + + # Added in 2.285 + ('pciSubSystemId', c_uint), + # New busId replaced the long deprecated and reserved fields with a + # field of the same size in 9.0 + ('busId', c_char * NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE), + ] + _fmt_ = { + 'domain' : "0x%08X", + 'bus' : "0x%02X", + 'device' : "0x%02X", + 'pciDeviceId' : "0x%08X", + 'pciSubSystemId' : "0x%08X", + } + +class c_nvmlSystemDriverBranchInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ("branch", c_char * NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE), + ] + +SystemDriverBranchInfo_v1 = 0x1000054 + +class c_nvmlExcludedDeviceInfo_t(_PrintableStructure): + _fields_ = [ + ('pci', nvmlPciInfo_t), + ('uuid', c_char * NVML_DEVICE_UUID_BUFFER_SIZE) + ] + +class nvmlNvLinkUtilizationControl_t(_PrintableStructure): + _fields_ = [ + ('units', _nvmlNvLinkUtilizationCountUnits_t), + ('pktfilter', _nvmlNvLinkUtilizationCountPktTypes_t), + ] + +class c_nvmlMemory_t(_PrintableStructure): + _fields_ = [ + ('total', c_ulonglong), + ('free', c_ulonglong), + ('used', c_ulonglong), + ] + _fmt_ = {'': "%d B"} + +class 
c_nvmlMemory_v2_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('total', c_ulonglong), + ('reserved', c_ulonglong), + ('free', c_ulonglong), + ('used', c_ulonglong), + ] + _fmt_ = {'': "%d B"} + +nvmlMemory_v2 = 0x02000028 + +class c_nvmlBAR1Memory_t(_PrintableStructure): + _fields_ = [ + ('bar1Total', c_ulonglong), + ('bar1Free', c_ulonglong), + ('bar1Used', c_ulonglong), + ] + _fmt_ = {'': "%d B"} + +class nvmlClkMonFaultInfo_t(Structure): + _fields_ = [("clkApiDomain", c_uint), + ("clkDomainFaultMask", c_uint) + ] + +MAX_CLK_DOMAINS = 32 + +class nvmlClkMonStatus_t(Structure): + _fields_ = [("bGlobalStatus", c_uint), + ("clkMonListSize", c_uint), + ("clkMonList", nvmlClkMonFaultInfo_t * MAX_CLK_DOMAINS) + ] + +# On Windows with the WDDM driver, usedGpuMemory is reported as None +# Code that processes this structure should check for None, I.E. +# +# if (info.usedGpuMemory == None): +# # TODO handle the error +# pass +# else: +# print("Using %d MiB of memory" % (info.usedGpuMemory / 1024 / 1024)) +# endif +# +# See NVML documentation for more information +class c_nvmlProcessInfo_v2_t(_PrintableStructure): + _fields_ = [ + ('pid', c_uint), + ('usedGpuMemory', c_ulonglong), + ('gpuInstanceId', c_uint), + ('computeInstanceId', c_uint), + ] + _fmt_ = {'usedGpuMemory': "%d B"} + +c_nvmlProcessInfo_v3_t = c_nvmlProcessInfo_v2_t + +c_nvmlProcessInfo_t = c_nvmlProcessInfo_v3_t + +_nvmlProcessMode_t = c_uint +NVML_PROCESS_MODE_COMPUTE = 0 +NVML_PROCESS_MODE_GRAPHICS = 1 +NVML_PROCESS_MODE_MPS = 2 + +class c_nvmlProcessDetail_v1_t(Structure): + _fields_ = [ + ('pid', c_uint), + ('usedGpuMemory', c_ulonglong), + ('gpuInstanceId', c_uint), + ('computeInstanceId', c_uint), + ('usedGpuCcProtectedMemory', c_ulonglong), + ] + +class c_nvmlProcessDetailList_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('mode', _nvmlProcessMode_t), + ('numProcArrayEntries', c_uint), + ('procArray', POINTER(c_nvmlProcessDetail_v1_t)), + ] + _fmt_ = {'numProcArrayEntries': "%d B"} + +c_nvmlProcessDetailList_t = c_nvmlProcessDetailList_v1_t + +nvmlProcessDetailList_v1 = 0x1000018 + +class c_nvmlBridgeChipInfo_t(_PrintableStructure): + _fields_ = [ + ('type', _nvmlBridgeChipType_t), + ('fwVersion', c_uint), + ] + +class c_nvmlBridgeChipHierarchy_t(_PrintableStructure): + _fields_ = [ + ('bridgeCount', c_uint), + ('bridgeChipInfo', c_nvmlBridgeChipInfo_t * 128), + ] + +class c_nvmlEccErrorCounts_t(_PrintableStructure): + _fields_ = [ + ('l1Cache', c_ulonglong), + ('l2Cache', c_ulonglong), + ('deviceMemory', c_ulonglong), + ('registerFile', c_ulonglong), + ] + +class c_nvmlUtilization_t(_PrintableStructure): + _fields_ = [ + ('gpu', c_uint), + ('memory', c_uint), + ] + _fmt_ = {'': "%d %%"} + +# Added in 2.285 +class c_nvmlHwbcEntry_t(_PrintableStructure): + _fields_ = [ + ('hwbcId', c_uint), + ('firmwareVersion', c_char * 32), + ] + +class c_nvmlValue_t(Union): + _fields_ = [ + ('dVal', c_double), + ('uiVal', c_uint), + ('ulVal', c_ulong), + ('ullVal', c_ulonglong), + ('sllVal', c_longlong), + ('siVal', c_int), + ('usVal', c_ushort), + ] + +class c_nvmlSample_t(_PrintableStructure): + _fields_ = [ + ('timeStamp', c_ulonglong), + ('sampleValue', c_nvmlValue_t), + ] + +class c_nvmlViolationTime_t(_PrintableStructure): + _fields_ = [ + ('referenceTime', c_ulonglong), + ('violationTime', c_ulonglong), + ] + +class c_nvmlFieldValue_t(_PrintableStructure): + _fields_ = [ + ('fieldId', c_uint32), + ('scopeId', c_uint32), + ('timestamp', c_int64), + ('latencyUsec', c_int64), + ('valueType', 
_nvmlValueType_t), + ('nvmlReturn', _nvmlReturn_t), + ('value', c_nvmlValue_t) + ] + +NVML_NVLINK_TOTAL_SUPPORTED_BW_MODES = 23 + +nvmlNvlinkSupportedBwModes_v1 = 0x100001c +class c_nvmlNvlinkSupportedBwModes_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('bwModes', c_uint8 * NVML_NVLINK_TOTAL_SUPPORTED_BW_MODES), + ('totalBwModes', c_uint8) + ] + + def __init__(self): + super(c_nvmlNvlinkSupportedBwModes_v1_t, self).__init__(version=nvmlNvlinkSupportedBwModes_v1) + +nvmlNvlinkGetBwMode_v1 = 0x100000c +class c_nvmlNvlinkGetBwMode_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('bIsBest', c_uint), + ('bwMode', c_uint8) + ] + + def __init__(self): + super(c_nvmlNvlinkGetBwMode_v1_t, self).__init__(version=nvmlNvlinkGetBwMode_v1) + +nvmlNvlinkSetBwMode_v1 = 0x100000c +class c_nvmlNvlinkSetBwMode_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('bSetBest', c_uint), + ('bwMode', c_uint8) + ] + + def __init__(self): + super(c_nvmlNvlinkSetBwMode_v1_t, self).__init__(version=nvmlNvlinkSetBwMode_v1) + +class c_nvmlVgpuHeterogeneousMode_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('mode', c_uint), + ] + +VgpuHeterogeneousMode_v1 = 0x1000008 + +class c_nvmlVgpuPlacementId_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('placementId', c_uint), + ] + +VgpuPlacementId_v1 = 0x1000008 + +class c_nvmlVgpuPlacementList_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('count', c_uint), + ('placementSize', c_uint), + ('placementIds', POINTER(c_uint)), + ] + +VgpuPlacementList_v1 = 0x1000018 + +NVML_VGPU_PGPU_HETEROGENEOUS_MODE = 0 +NVML_VGPU_PGPU_HOMOGENEOUS_MODE = 1 + +class c_nvmlVgpuPlacementList_v2_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('placementSize', c_uint), + ('count', c_uint), + ('placementIds', POINTER(c_uint)), + ('mode', c_uint), + ] + +VgpuPlacementList_v2 = 0x2000020 + +class c_nvmlVgpuTypeBar1Info_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('bar1Size', c_ulonglong), + ] + +VgpuTypeBar1Info_v1 = 0x1000010 + +class c_nvmlVgpuInstanceUtilizationSample_t(_PrintableStructure): + _fields_ = [ + ('vgpuInstance', _nvmlVgpuInstance_t), + ('timeStamp', c_ulonglong), + ('smUtil', c_nvmlValue_t), + ('memUtil', c_nvmlValue_t), + ('encUtil', c_nvmlValue_t), + ('decUtil', c_nvmlValue_t), + ] + +class c_nvmlVgpuInstanceUtilizationInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('timeStamp', c_ulonglong), + ('vgpuInstance', _nvmlVgpuInstance_t), + ('smUtil', c_nvmlValue_t), + ('memUtil', c_nvmlValue_t), + ('encUtil', c_nvmlValue_t), + ('decUtil', c_nvmlValue_t), + ('jpgUtil', c_nvmlValue_t), + ('ofaUtil', c_nvmlValue_t), + ] + +class c_nvmlVgpuInstancesUtilizationInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('sampleValType', _nvmlValueType_t), + ('vgpuInstanceCount', c_uint), + ('lastSeenTimeStamp', c_ulonglong), + ('vgpuUtilArray', POINTER(c_nvmlVgpuInstanceUtilizationInfo_v1_t)), + ] + +VgpuInstancesUtilizationInfo_v1 = 0x01000020 + +class c_nvmlVgpuProcessUtilizationSample_t(_PrintableStructure): + _fields_ = [ + ('vgpuInstance', _nvmlVgpuInstance_t), + ('pid', c_uint), + ('processName', c_char * NVML_VGPU_NAME_BUFFER_SIZE), + ('timeStamp', c_ulonglong), + ('smUtil', c_uint), + ('memUtil', c_uint), + ('encUtil', c_uint), + ('decUtil', c_uint), + ] + +class c_nvmlVgpuProcessUtilizationInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('processName', c_char * NVML_VGPU_NAME_BUFFER_SIZE), + ('timeStamp', 
c_ulonglong), + ('vgpuInstance', _nvmlVgpuInstance_t), + ('pid', c_uint), + ('smUtil', c_uint), + ('memUtil', c_uint), + ('encUtil', c_uint), + ('decUtil', c_uint), + ('jpgUtil', c_uint), + ('ofaUtil', c_uint), + ] + +class c_nvmlVgpuProcessesUtilizationInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('vgpuProcessCount', c_uint), + ('lastSeenTimeStamp', c_ulonglong), + ('vgpuProcUtilArray', POINTER(c_nvmlVgpuProcessUtilizationInfo_v1_t)), + ] + +VgpuProcessesUtilizationInfo_v1 = 0x01000018 + +class nvmlVgpuRuntimeState_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('size', c_ulonglong), + ] + +VgpuRuntimeState_v1 = 0x1000010 + +class c_nvmlVgpuLicenseExpiry_t(_PrintableStructure): + _fields_ = [ + ('year', c_uint32), + ('month', c_uint16), + ('day', c_uint16), + ('hour', c_uint16), + ('min', c_uint16), + ('sec', c_uint16), + ('status', c_uint8), + ] + +NVML_GRID_LICENSE_STATE_UNKNOWN = 0 +NVML_GRID_LICENSE_STATE_UNINITIALIZED = 1 +NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED = 2 +NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED = 3 +NVML_GRID_LICENSE_STATE_UNLICENSED = 4 +NVML_GRID_LICENSE_STATE_LICENSED = 5 + +class c_nvmlVgpuLicenseInfo_t(_PrintableStructure): + _fields_ = [ + ('isLicensed', c_uint8), + ('licenseExpiry', c_nvmlVgpuLicenseExpiry_t), + ('currentState', c_uint), + ] + +class c_nvmlEncoderSession_t(_PrintableStructure): + _fields_ = [ + ('sessionId', c_uint), + ('pid', c_uint), + ('vgpuInstance', _nvmlVgpuInstance_t), + ('codecType', c_uint), + ('hResolution', c_uint), + ('vResolution', c_uint), + ('averageFps', c_uint), + ('encodeLatency', c_uint), + ] + +class c_nvmlProcessUtilizationSample_t(_PrintableStructure): + _fields_ = [ + ('pid', c_uint), + ('timeStamp', c_ulonglong), + ('smUtil', c_uint), + ('memUtil', c_uint), + ('encUtil', c_uint), + ('decUtil', c_uint), + ] + +class c_nvmlProcessUtilizationInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('timeStamp', c_ulonglong), + ('pid', c_uint), + ('smUtil', c_uint), + ('memUtil', c_uint), + ('encUtil', c_uint), + ('decUtil', c_uint), + ('jpgUtil', c_uint), + ('ofaUtil', c_uint), + ] + +class c_nvmlProcessesUtilizationInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('processSamplesCount', c_uint), + ('lastSeenTimeStamp', c_ulonglong), + ('procUtilArray', POINTER(c_nvmlProcessUtilizationInfo_v1_t)), + ] + +ProcessesUtilizationInfo_v1 = 0x01000018 + +class c_nvmlGridLicenseExpiry_t(_PrintableStructure): + _fields_ = [ + ('year', c_uint32), + ('month', c_uint16), + ('day', c_uint16), + ('hour', c_uint16), + ('min', c_uint16), + ('sec', c_uint16), + ('status', c_uint8), + ] + +class c_nvmlGridLicensableFeature_v4_t(_PrintableStructure): + _fields_ = [ + ('featureCode', _nvmlGridLicenseFeatureCode_t), + ('featureState', c_uint), + ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('productName', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('featureEnabled', c_uint), + ('licenseExpiry', c_nvmlGridLicenseExpiry_t), + ] + +class c_nvmlGridLicensableFeatures_v4_t(_PrintableStructure): + _fields_ = [ + ('isGridLicenseSupported', c_int), + ('licensableFeaturesCount', c_uint), + ('gridLicensableFeatures', c_nvmlGridLicensableFeature_v4_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), + ] + +class c_nvmlGridLicensableFeature_v3_t(_PrintableStructure): + _fields_ = [ + ('featureCode', _nvmlGridLicenseFeatureCode_t), + ('featureState', c_uint), + ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('productName', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + 
('featureEnabled', c_uint), + ] + +class c_nvmlGridLicensableFeatures_v3_t(_PrintableStructure): + _fields_ = [ + ('isGridLicenseSupported', c_int), + ('licensableFeaturesCount', c_uint), + ('gridLicensableFeatures', c_nvmlGridLicensableFeature_v3_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), + ] + +class c_nvmlGridLicensableFeature_v2_t(_PrintableStructure): + _fields_ = [ + ('featureCode', _nvmlGridLicenseFeatureCode_t), + ('featureState', c_uint), + ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('productName', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ] + +class c_nvmlGridLicensableFeatures_v2_t(_PrintableStructure): + _fields_ = [ + ('isGridLicenseSupported', c_int), + ('licensableFeaturesCount', c_uint), + ('gridLicensableFeatures', c_nvmlGridLicensableFeature_v2_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), + ] + +class c_nvmlGridLicensableFeature_t(_PrintableStructure): + _fields_ = [ + ('featureCode', _nvmlGridLicenseFeatureCode_t), + ('featureState', c_uint), + ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ] + +class c_nvmlGridLicensableFeatures_t(_PrintableStructure): + _fields_ = [ + ('isGridLicenseSupported', c_int), + ('licensableFeaturesCount', c_uint), + ('gridLicensableFeatures', c_nvmlGridLicensableFeature_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), + ] + +class c_nvmlMarginTemperature_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('marginTemperature', c_int), + ] + +nvmlMarginTemperature_v1 = 0x1000008 + +## Event structures +class struct_c_nvmlEventSet_t(Structure): + pass # opaque handle +c_nvmlEventSet_t = POINTER(struct_c_nvmlEventSet_t) + +nvmlEventTypeSingleBitEccError = 0x0000000000000001 +nvmlEventTypeDoubleBitEccError = 0x0000000000000002 +nvmlEventTypePState = 0x0000000000000004 +nvmlEventTypeXidCriticalError = 0x0000000000000008 +nvmlEventTypeClock = 0x0000000000000010 +nvmlEventTypePowerSourceChange = 0x0000000000000080 +nvmlEventMigConfigChange = 0x0000000000000100 +nvmlEventTypeSingleBitEccErrorStorm = 0x0000000000000200 +nvmlEventTypeDramRetirementEvent = 0x0000000000000400 +nvmlEventTypeDramRetirementFailure = 0x0000000000000800 +nvmlEventTypeNonFatalPoisonError = 0x0000000000001000 +nvmlEventTypeFatalPoisonError = 0x0000000000002000 +nvmlEventTypeGpuUnavailableError = 0x0000000000004000 +nvmlEventTypeGpuRecoveryAction = 0x0000000000008000 +nvmlEventTypeNone = 0x0000000000000000 +nvmlEventTypeAll = ( + nvmlEventTypeNone + | nvmlEventTypeSingleBitEccError + | nvmlEventTypeDoubleBitEccError + | nvmlEventTypePState + | nvmlEventTypeClock + | nvmlEventTypePowerSourceChange + | nvmlEventTypeXidCriticalError + | nvmlEventMigConfigChange + | nvmlEventTypeSingleBitEccErrorStorm + | nvmlEventTypeDramRetirementEvent + | nvmlEventTypeDramRetirementFailure + | nvmlEventTypeNonFatalPoisonError + | nvmlEventTypeFatalPoisonError + | nvmlEventTypeGpuUnavailableError + | nvmlEventTypeGpuRecoveryAction + ) + +## Clock Event Reasons defines +nvmlClocksEventReasonGpuIdle = 0x0000000000000001 +nvmlClocksEventReasonApplicationsClocksSetting = 0x0000000000000002 +nvmlClocksEventReasonUserDefinedClocks = nvmlClocksEventReasonApplicationsClocksSetting # deprecated, use nvmlClocksEventReasonApplicationsClocksSetting +nvmlClocksEventReasonSwPowerCap = 0x0000000000000004 +nvmlClocksEventReasonHwSlowdown = 0x0000000000000008 +nvmlClocksEventReasonSyncBoost = 0x0000000000000010 +nvmlClocksEventReasonSwThermalSlowdown = 0x0000000000000020 +nvmlClocksEventReasonHwThermalSlowdown = 0x0000000000000040 +nvmlClocksEventReasonHwPowerBrakeSlowdown = 
0x0000000000000080 +nvmlClocksEventReasonDisplayClockSetting = 0x0000000000000100 +nvmlClocksEventReasonNone = 0x0000000000000000 +nvmlClocksEventReasonAll = ( + nvmlClocksEventReasonNone | + nvmlClocksEventReasonGpuIdle | + nvmlClocksEventReasonApplicationsClocksSetting | + nvmlClocksEventReasonSwPowerCap | + nvmlClocksEventReasonHwSlowdown | + nvmlClocksEventReasonSyncBoost | + nvmlClocksEventReasonSwThermalSlowdown | + nvmlClocksEventReasonHwThermalSlowdown | + nvmlClocksEventReasonHwPowerBrakeSlowdown | + nvmlClocksEventReasonDisplayClockSetting + ) + +## Following have been deprecated +nvmlClocksThrottleReasonGpuIdle = 0x0000000000000001 +nvmlClocksThrottleReasonApplicationsClocksSetting = 0x0000000000000002 +nvmlClocksThrottleReasonUserDefinedClocks = nvmlClocksThrottleReasonApplicationsClocksSetting # deprecated, use nvmlClocksThrottleReasonApplicationsClocksSetting +nvmlClocksThrottleReasonSwPowerCap = 0x0000000000000004 +nvmlClocksThrottleReasonHwSlowdown = 0x0000000000000008 +nvmlClocksThrottleReasonSyncBoost = 0x0000000000000010 +nvmlClocksThrottleReasonSwThermalSlowdown = 0x0000000000000020 +nvmlClocksThrottleReasonHwThermalSlowdown = 0x0000000000000040 +nvmlClocksThrottleReasonHwPowerBrakeSlowdown = 0x0000000000000080 +nvmlClocksThrottleReasonDisplayClockSetting = 0x0000000000000100 +nvmlClocksThrottleReasonNone = 0x0000000000000000 +nvmlClocksThrottleReasonAll = ( + nvmlClocksThrottleReasonNone | + nvmlClocksThrottleReasonGpuIdle | + nvmlClocksThrottleReasonApplicationsClocksSetting | + nvmlClocksThrottleReasonSwPowerCap | + nvmlClocksThrottleReasonHwSlowdown | + nvmlClocksThrottleReasonSyncBoost | + nvmlClocksThrottleReasonSwThermalSlowdown | + nvmlClocksThrottleReasonHwThermalSlowdown | + nvmlClocksThrottleReasonHwPowerBrakeSlowdown | + nvmlClocksThrottleReasonDisplayClockSetting + ) + +class c_nvmlEventData_t(_PrintableStructure): + _fields_ = [ + ('device', c_nvmlDevice_t), + ('eventType', c_ulonglong), + ('eventData', c_ulonglong), + ('gpuInstanceId', c_uint), + ('computeInstanceId', c_uint) + ] + _fmt_ = {'eventType': "0x%08X"} + +class c_nvmlAccountingStats_t(_PrintableStructure): + _fields_ = [ + ('gpuUtilization', c_uint), + ('memoryUtilization', c_uint), + ('maxMemoryUsage', c_ulonglong), + ('time', c_ulonglong), + ('startTime', c_ulonglong), + ('isRunning', c_uint), + ('reserved', c_uint * 5) + ] + +class c_nvmlVgpuVersion_t(Structure): + _fields_ = [("minVersion", c_uint), + ("maxVersion", c_uint) + ] + +class c_nvmlVgpuMetadata_t(_PrintableStructure): + _fields_ = [("version", c_uint), + ("revision", c_uint), + ("guestInfoState", _nvmlVgpuGuestInfoState_t), + ("guestDriverVersion", c_char * NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE), + ("hostDriverVersion", c_char * NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE), + ("reserved", c_uint * 6), + ("vgpuVirtualizationCaps", c_uint), + ("guestVgpuVersion", c_uint), + ("opaqueDataSize", c_uint), + ("opaqueData", c_char * NVML_VGPU_METADATA_OPAQUE_DATA_SIZE) + ] + +class c_nvmlVgpuPgpuMetadata_t(_PrintableStructure): + _fields_ = [("version", c_uint), + ("revision", c_uint), + ("hostDriverVersion", c_char * NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE), + ("pgpuVirtualizationCaps", c_uint), + ("reserved", c_uint * 5), + ("hostSupportedVgpuRange", c_nvmlVgpuVersion_t), + ("opaqueDataSize", c_uint), + ("opaqueData", c_char * NVML_VGPU_PGPU_METADATA_OPAQUE_DATA_SIZE) + ] + +class c_nvmlVgpuPgpuCompatibility_t(Structure): + _fields_ = [("vgpuVmCompatibility", _nvmlVgpuVmCompatibility_t), + ("compatibilityLimitCode", 
_nvmlVgpuPgpuCompatibilityLimitCode_t) + ] + +## vGPU scheduler policy defines +NVML_VGPU_SCHEDULER_POLICY_UNKNOWN = 0 +NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT = 1 +NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE = 2 +NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE = 3 + +## Supported vGPU scheduler policy count +NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT = 3 + +NVML_SCHEDULER_SW_MAX_LOG_ENTRIES = 200 + +NVML_VGPU_SCHEDULER_ARR_DEFAULT = 0 +NVML_VGPU_SCHEDULER_ARR_DISABLE = 1 +NVML_VGPU_SCHEDULER_ARR_ENABLE = 2 + +class c_nvmlVgpuSchedDataWithARR_t(_PrintableStructure): + _fields_ = [ + ('avgFactor', c_uint), + ('timeslice', c_uint), + ] + +class c_nvmlVgpuSchedData_t(_PrintableStructure): + _fields_ = [ + ('timeslice', c_uint), + ] + +class c_nvmlVgpuSchedulerParams_t(Union): + _fields_ = [ + ('vgpuSchedDataWithARR', c_nvmlVgpuSchedDataWithARR_t), + ('vgpuSchedData', c_nvmlVgpuSchedData_t), + ] + +class c_nvmlVgpuSchedulerLogEntry_t(_PrintableStructure): + _fields_ = [ + ('timestamp', c_ulonglong), + ('timeRunTotal', c_ulonglong), + ('timeRun', c_ulonglong), + ('swRunlistId', c_uint), + ('targetTimeSlice', c_ulonglong), + ('cumulativePreemptionTime', c_ulonglong), + ] + +class c_nvmlVgpuSchedulerLog_t(_PrintableStructure): + _fields_ = [ + ('engineId', c_uint), + ('schedulerPolicy', c_uint), + ('arrMode', c_uint), + ('schedulerParams', c_nvmlVgpuSchedulerParams_t), + ('entriesCount', c_uint), + ('logEntries', c_nvmlVgpuSchedulerLogEntry_t * NVML_SCHEDULER_SW_MAX_LOG_ENTRIES), + ] + +class c_nvmlVgpuSchedulerGetState_t(_PrintableStructure): + _fields_ = [ + ('schedulerPolicy', c_uint), + ('arrMode', c_uint), + ('schedulerParams', c_nvmlVgpuSchedulerParams_t), + ] + +class c_nvmlVgpuSchedSetDataWithARR_t(_PrintableStructure): + _fields_ = [ + ('avgFactor', c_uint), + ('frequency', c_uint), + ] + +class c_nvmlVgpuSchedSetData_t(_PrintableStructure): + _fields_ = [ + ('timeslice', c_uint), + ] + +class c_nvmlVgpuSchedulerSetParams_t(Union): + _fields_ = [ + ('vgpuSchedDataWithARR', c_nvmlVgpuSchedSetDataWithARR_t), + ('vgpuSchedData', c_nvmlVgpuSchedSetData_t), + ] + +class c_nvmlVgpuSchedulerSetState_t(_PrintableStructure): + _fields_ = [ + ('schedulerPolicy', c_uint), + ('enableARRMode', c_uint), + ('schedulerParams', c_nvmlVgpuSchedulerSetParams_t), + ] + +class c_nvmlVgpuSchedulerCapabilities_t(_PrintableStructure): + _fields_ = [ + ('supportedSchedulers', c_uint * NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT), + ('maxTimeslice', c_uint), + ('minTimeslice', c_uint), + ('isArrModeSupported', c_uint), + ('maxFrequencyForARR', c_uint), + ('minFrequencyForARR', c_uint), + ('maxAvgFactorForARR', c_uint), + ('minAvgFactorForARR', c_uint), + ] + +class c_nvmlFBCStats_t(Structure): + _fields_ = [("sessionsCount", c_uint), + ("averageFPS", c_uint), + ("averageLatency", c_uint) + ] + +class c_nvmlFBCSession_t(_PrintableStructure): + _fields_ = [ + ('sessionId', c_uint), + ('pid', c_uint), + ('vgpuInstance', _nvmlVgpuInstance_t), + ('displayOrdinal', c_uint), + ('sessionType', c_uint), + ('sessionFlags', c_uint), + ('hMaxResolution', c_uint), + ('vMaxResolution', c_uint), + ('hResolution', c_uint), + ('vResolution', c_uint), + ('averageFPS', c_uint), + ('averageLatency', c_uint), + ] + +NVML_DEVICE_MIG_DISABLE = 0x0 +NVML_DEVICE_MIG_ENABLE = 0x1 + +NVML_GPU_INSTANCE_PROFILE_1_SLICE = 0x0 +NVML_GPU_INSTANCE_PROFILE_2_SLICE = 0x1 +NVML_GPU_INSTANCE_PROFILE_3_SLICE = 0x2 +NVML_GPU_INSTANCE_PROFILE_4_SLICE = 0x3 +NVML_GPU_INSTANCE_PROFILE_7_SLICE = 0x4 +NVML_GPU_INSTANCE_PROFILE_8_SLICE = 0x5 
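# ----------------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of the patched file): the
# versioned API structs in this module appear to follow one convention -- each
# `nvml<Name>_vN` constant packs the struct revision in the top byte and the
# struct size in the low bytes, i.e. (N << 24) | sizeof(struct). For example,
# nvmlMemory_v2 == 0x02000028 because c_nvmlMemory_v2_t is 0x28 (40) bytes on a
# typical 64-bit build. The helper below is hypothetical, is never called, and
# only spot-checks that relationship for two structs defined earlier above;
# exact sizes assume standard 64-bit alignment.
def _editorial_struct_version_sketch():
    import ctypes

    def packed_version(struct_type, revision):
        # Mirrors the (revision << 24) | sizeof(...) packing described above.
        return (revision << 24) | ctypes.sizeof(struct_type)

    # Both version constants and struct types are defined earlier in this file.
    assert nvmlMemory_v2 == packed_version(c_nvmlMemory_v2_t, 2)
    assert nvmlMarginTemperature_v1 == packed_version(c_nvmlMarginTemperature_v1_t, 1)
# ----------------------------------------------------------------------------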
+NVML_GPU_INSTANCE_PROFILE_6_SLICE = 0x6 +NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 +NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 = 0x8 +NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 = 0x9 +NVML_GPU_INSTANCE_PROFILE_1_SLICE_GFX = 0xA +NVML_GPU_INSTANCE_PROFILE_2_SLICE_GFX = 0xB +NVML_GPU_INSTANCE_PROFILE_4_SLICE_GFX = 0xC +NVML_GPU_INSTANCE_PROFILE_COUNT = 0xD + +class c_nvmlGpuInstancePlacement_t(Structure): + _fields_ = [("start", c_uint), + ("size", c_uint) + ] + +class c_nvmlGpuInstanceProfileInfo_t(Structure): + _fields_ = [("id", c_uint), + ("isP2pSupported", c_uint), + ("sliceCount", c_uint), + ("instanceCount", c_uint), + ("multiprocessorCount", c_uint), + ("copyEngineCount", c_uint), + ("decoderCount", c_uint), + ("encoderCount", c_uint), + ("jpegCount", c_uint), + ("ofaCount", c_uint), + ("memorySizeMB", c_ulonglong), + ] + +nvmlGpuInstanceProfileInfo_v2 = 0x02000098 + +class c_nvmlGpuInstanceProfileInfo_v2_t(_PrintableStructure): + _fields_ = [("version", c_uint), + ("id", c_uint), + ("isP2pSupported", c_uint), + ("sliceCount", c_uint), + ("instanceCount", c_uint), + ("multiprocessorCount", c_uint), + ("copyEngineCount", c_uint), + ("decoderCount", c_uint), + ("encoderCount", c_uint), + ("jpegCount", c_uint), + ("ofaCount", c_uint), + ("memorySizeMB", c_ulonglong), + ("name", c_char * NVML_DEVICE_NAME_V2_BUFFER_SIZE) + ] + + def __init__(self): + super(c_nvmlGpuInstanceProfileInfo_v2_t, self).__init__(version=nvmlGpuInstanceProfileInfo_v2) + +class c_nvmlGpuInstanceInfo_t(Structure): + _fields_ = [("device", c_nvmlDevice_t), + ("id", c_uint), + ("profileId", c_uint), + ("placement", c_nvmlGpuInstancePlacement_t) + ] + +class struct_c_nvmlGpuInstance_t(Structure): + pass # opaque handle +c_nvmlGpuInstance_t = POINTER(struct_c_nvmlGpuInstance_t) + +NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE = 0x0 +NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE = 0x1 +NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE = 0x2 +NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE = 0x3 +NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE = 0x4 +NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE = 0x5 +NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE = 0x6 +NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 +NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x8 + +NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = 0x0 +NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = 0x1 + +class c_nvmlComputeInstancePlacement_t(Structure): + _fields_ = [("start", c_uint), + ("size", c_uint) + ] + +class c_nvmlComputeInstanceProfileInfo_t(Structure): + _fields_ = [("id", c_uint), + ("sliceCount", c_uint), + ("instanceCount", c_uint), + ("multiprocessorCount", c_uint), + ("sharedCopyEngineCount", c_uint), + ("sharedDecoderCount", c_uint), + ("sharedEncoderCount", c_uint), + ("sharedJpegCount", c_uint), + ("sharedOfaCount", c_uint) + ] + +nvmlComputeInstanceProfileInfo_v2 = 0x02000088 + +class c_nvmlComputeInstanceProfileInfo_v2_t(_PrintableStructure): + _fields_ = [("version", c_uint), + ("id", c_uint), + ("sliceCount", c_uint), + ("instanceCount", c_uint), + ("multiprocessorCount", c_uint), + ("sharedCopyEngineCount", c_uint), + ("sharedDecoderCount", c_uint), + ("sharedEncoderCount", c_uint), + ("sharedJpegCount", c_uint), + ("sharedOfaCount", c_uint), + ("name", c_char * NVML_DEVICE_NAME_V2_BUFFER_SIZE) + ] + + def __init__(self): + super(c_nvmlComputeInstanceProfileInfo_v2_t, self).__init__(version=nvmlComputeInstanceProfileInfo_v2) + +class c_nvmlComputeInstanceInfo_t(Structure): + _fields_ = [("device", c_nvmlDevice_t), + ("gpuInstance", c_nvmlGpuInstance_t), + ("id", c_uint), + ("profileId", c_uint), + ("placement", 
c_nvmlComputeInstancePlacement_t) + ] + +NVML_MAX_GPU_UTILIZATIONS = 8 +NVML_GPU_UTILIZATION_DOMAIN_GPU = 0 +NVML_GPU_UTILIZATION_DOMAIN_FB = 1 +NVML_GPU_UTILIZATION_DOMAIN_VID = 2 +NVML_GPU_UTILIZATION_DOMAIN_BUS = 3 +class c_nvmlGpuDynamicPstatesUtilization_t(Structure): + _fields_ = [("bIsPresent", c_uint, 1), + ("percentage", c_uint), + ("incThreshold", c_uint), + ("decThreshold", c_uint)] +class c_nvmlGpuDynamicPstatesInfo_t(Structure): + _fields_ = [("flags", c_uint), + ("utilization", c_nvmlGpuDynamicPstatesUtilization_t * NVML_MAX_GPU_UTILIZATIONS)] + +NVML_MAX_THERMAL_SENSORS_PER_GPU = 3 + +NVML_THERMAL_TARGET_NONE = 0 +NVML_THERMAL_TARGET_GPU = 1 +NVML_THERMAL_TARGET_MEMORY = 2 +NVML_THERMAL_TARGET_POWER_SUPPLY = 4 +NVML_THERMAL_TARGET_BOARD = 8 +NVML_THERMAL_TARGET_VCD_BOARD = 9 +NVML_THERMAL_TARGET_VCD_INLET = 10 +NVML_THERMAL_TARGET_VCD_OUTLET = 11 +NVML_THERMAL_TARGET_ALL = 15 +NVML_THERMAL_TARGET_UNKNOWN = -1 + +NVML_THERMAL_CONTROLLER_NONE = 0 +NVML_THERMAL_CONTROLLER_GPU_INTERNAL = 1 +NVML_THERMAL_CONTROLLER_ADM1032 = 2 +NVML_THERMAL_CONTROLLER_ADT7461 = 3 +NVML_THERMAL_CONTROLLER_MAX6649 = 4 +NVML_THERMAL_CONTROLLER_MAX1617 = 5 +NVML_THERMAL_CONTROLLER_LM99 = 6 +NVML_THERMAL_CONTROLLER_LM89 = 7 +NVML_THERMAL_CONTROLLER_LM64 = 8 +NVML_THERMAL_CONTROLLER_G781 = 9 +NVML_THERMAL_CONTROLLER_ADT7473 = 10 +NVML_THERMAL_CONTROLLER_SBMAX6649 = 11 +NVML_THERMAL_CONTROLLER_VBIOSEVT = 12 +NVML_THERMAL_CONTROLLER_OS = 13 +NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS = 14 +NVML_THERMAL_CONTROLLER_NVSYSCON_E551 = 15 +NVML_THERMAL_CONTROLLER_MAX6649R = 16 +NVML_THERMAL_CONTROLLER_ADT7473S = 17 +NVML_THERMAL_CONTROLLER_UNKNOWN = -1 + +class c_nvmlGpuThermalSensor_t(Structure): + _fields_ = [("controller", c_int), + ("defaultMinTemp", c_int), + ("defaultMaxTemp", c_int), + ("currentTemp", c_int), + ("target", c_int)] +class c_nvmlGpuThermalSettings_t(Structure): + _fields_ = [("count", c_uint), + ("sensor", c_nvmlGpuThermalSensor_t * NVML_MAX_THERMAL_SENSORS_PER_GPU)] + +_nvmlCoolerControl_t = c_uint +NVML_THERMAL_COOLER_SIGNAL_NONE = 0 +NVML_THERMAL_COOLER_SIGNAL_TOGGLE = 1 +NVML_THERMAL_COOLER_SIGNAL_VARIABLE = 2 +NVML_THERMAL_COOLER_SIGNAL_COUNT = 3 + +_nvmlCoolerTarget_t = c_uint +NVML_THERMAL_COOLER_TARGET_NONE = (1 << 0) +NVML_THERMAL_COOLER_TARGET_GPU = (1 << 1) +NVML_THERMAL_COOLER_TARGET_MEMORY = (1 << 2) +NVML_THERMAL_COOLER_TARGET_POWER_SUPPLY = (1 << 3) +NVML_THERMAL_COOLER_TARGET_GPU_RELATED = (NVML_THERMAL_COOLER_TARGET_GPU | NVML_THERMAL_COOLER_TARGET_MEMORY | NVML_THERMAL_COOLER_TARGET_POWER_SUPPLY) + +class c_nvmlCoolerInfo_t(_PrintableStructure): + _fields_ = [("version", c_uint), + ("index", c_uint), + ("coolerControlType", _nvmlCoolerControl_t), + ("coolerTarget", _nvmlCoolerTarget_t) + ] + +nvmlCoolerInfo_v1 = 0x1000010 + +def nvmlDeviceGetCoolerInfo(handle): + c_coolerInfo = c_nvmlCoolerInfo_t() + c_coolerInfo.version = nvmlCoolerInfo_v1 + c_coolerInfo.index = 0 + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCoolerInfo") + ret = fn(handle, byref(c_coolerInfo)) + _nvmlCheckReturn(ret) + return [c_coolerInfo.coolerControlType, c_coolerInfo.coolerTarget] + +class struct_c_nvmlComputeInstance_t(Structure): + pass # opaque handle +c_nvmlComputeInstance_t = POINTER(struct_c_nvmlComputeInstance_t) + +class c_nvmlDeviceAttributes(Structure): + _fields_ = [("multiprocessorCount", c_uint), + ("sharedCopyEngineCount", c_uint), + ("sharedDecoderCount", c_uint), + ("sharedEncoderCount", c_uint), + ("sharedJpegCount", c_uint), + ("sharedOfaCount", c_uint), + ("gpuInstanceSliceCount", 
c_uint), + ("computeInstanceSliceCount", c_uint), + ("memorySizeMB", c_ulonglong), + ] + +class c_nvmlRowRemapperHistogramValues(Structure): + _fields_ = [("max", c_uint), + ("high", c_uint), + ("partial", c_uint), + ("low", c_uint), + ("none", c_uint) + ] + +NVML_GPU_CERT_CHAIN_SIZE = 0x1000 +NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE = 0x1400 +NVML_CC_GPU_CEC_NONCE_SIZE = 0x20 +NVML_CC_GPU_ATTESTATION_REPORT_SIZE = 0x2000 +NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE = 0x1000 +NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT = 0 +NVML_CC_CEC_ATTESTATION_REPORT_PRESENT = 1 + +class c_nvmlConfComputeSystemState_t(Structure): + _fields_ = [('environment', c_uint), + ('ccFeature', c_uint), + ('devToolsMode', c_uint), + ] + +nvmlSystemConfComputeSettings_v1 = 0x1000014 + +class c_nvmlSystemConfComputeSettings_v1_t(Structure): + _fields_ = [('version', c_uint), + ('environment', c_uint), + ('ccFeature', c_uint), + ('devToolsMode', c_uint), + ('multiGpuMode', c_uint), + ] + def __init__(self): + super(c_nvmlSystemConfComputeSettings_v1_t, self).__init__(version=nvmlSystemConfComputeSettings_v1) + +class c_nvmlConfComputeSystemCaps_t(Structure): + _fields_ = [('cpuCaps', c_uint), + ('gpusCaps', c_uint), + ] + +class c_nvmlConfComputeMemSizeInfo_t(Structure): + _fields_ = [('protectedMemSizeKib', c_ulonglong), + ('unprotectedMemSizeKib', c_ulonglong), + ] + +class c_nvmlConfComputeGpuCertificate_t(Structure): + _fields_ = [('certChainSize', c_uint), + ('attestationCertChainSize', c_uint), + ('certChain', c_uint8 * NVML_GPU_CERT_CHAIN_SIZE), + ('attestationCertChain', c_uint8 * NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE), + ] + +class c_nvmlConfComputeGpuAttestationReport_t(Structure): + _fields_ = [('isCecAttestationReportPresent', c_uint), + ('attestationReportSize', c_uint), + ('cecAttestationReportSize', c_uint), + ('nonce', c_uint8 * NVML_CC_GPU_CEC_NONCE_SIZE), + ('attestationReport', c_uint8 * NVML_CC_GPU_ATTESTATION_REPORT_SIZE), + ('cecAttestationReport', c_uint8 * NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE), + ] + +class c_nvmlConfComputeSetKeyRotationThresholdInfo_t(Structure): + _fields_ = [('version', c_uint), + ('maxAttackerAdvantage', c_ulong), + ] +ConfComputeSetKeyRotationThresholdInfo_v1 = 0x1000010 + +class c_nvmlConfComputeGetKeyRotationThresholdInfo_t(Structure): + _fields_ = [('version', c_uint), + ('attackerAdvantage', c_ulong), + ] +ConfComputeGetKeyRotationThresholdInfo_v1 = 0x1000010 + + +## string/bytes conversion for ease of use +def convertStrBytes(func): + ''' + In python 3, strings are unicode instead of bytes, and need to be converted for ctypes + Args from caller: (1, 'string', <__main__.c_nvmlDevice_t at 0xFFFFFFFF>) + Args passed to function: (1, b'string', <__main__.c_nvmlDevice_t at 0xFFFFFFFF)> + ---- + Returned from function: b'returned string' + Returned to caller: 'returned string' + ''' + @wraps(func) + def wrapper(*args, **kwargs): + # encoding a str returns bytes in python 2 and 3 + args = [arg.encode() if isinstance(arg, str) else arg for arg in args] + res = func(*args, **kwargs) + # In python 2, str and bytes are the same + # In python 3, str is unicode and should be decoded. + # Ctypes handles most conversions, this only effects c_char and char arrays. 
+ if isinstance(res, bytes): + if isinstance(res, str): + return res + return res.decode() + return res + + if sys.version_info >= (3,): + return wrapper + return func + +def throwOnVersionMismatch(func): + @wraps(func) + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except NVMLError_FunctionNotFound: + raise NVMLLibraryMismatchError("Unversioned function called and the " + "pyNVML version does not match the NVML lib version. " + "Either use matching pyNVML and NVML lib versions or " + "use a versioned function such as " + func.__name__ + "_v2") + return wrapper + +## C function wrappers ## +def nvmlInitWithFlags(flags): + _LoadNvmlLibrary() + + # + # Initialize the library + # + fn = _nvmlGetFunctionPointer("nvmlInitWithFlags") + ret = fn(flags) + _nvmlCheckReturn(ret) + + # Atomically update refcount + global _nvmlLib_refcount + libLoadLock.acquire() + _nvmlLib_refcount += 1 + libLoadLock.release() + return None + +def nvmlInit(): + nvmlInitWithFlags(0) + return None + +def _LoadNvmlLibrary(): + ''' + Load the library if it isn't loaded already + ''' + global nvmlLib + + if (nvmlLib == None): + # lock to ensure only one caller loads the library + libLoadLock.acquire() + + try: + # ensure the library still isn't loaded + if (nvmlLib == None): + try: + if (sys.platform[:3] == "win"): + # cdecl calling convention + try: + # Check for nvml.dll in System32 first for DCH drivers + nvmlLib = CDLL(os.path.join(os.getenv("WINDIR", "C:/Windows"), "System32/nvml.dll")) + except OSError as ose: + # If nvml.dll is not found in System32, it should be in ProgramFiles + # load nvml.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll + nvmlLib = CDLL(os.path.join(os.getenv("ProgramFiles", "C:/Program Files"), "NVIDIA Corporation/NVSMI/nvml.dll")) + else: + # assume linux + nvmlLib = CDLL("libnvidia-ml.so.1") + except OSError as ose: + _nvmlCheckReturn(NVML_ERROR_LIBRARY_NOT_FOUND) + if (nvmlLib == None): + _nvmlCheckReturn(NVML_ERROR_LIBRARY_NOT_FOUND) + finally: + # lock is always freed + libLoadLock.release() + +def nvmlShutdown(): + # + # Leave the library loaded, but shutdown the interface + # + fn = _nvmlGetFunctionPointer("nvmlShutdown") + ret = fn() + _nvmlCheckReturn(ret) + + # Atomically update refcount + global _nvmlLib_refcount + libLoadLock.acquire() + if (0 < _nvmlLib_refcount): + _nvmlLib_refcount -= 1 + libLoadLock.release() + return None + +# Added in 2.285 +@convertStrBytes +def nvmlErrorString(result): + fn = _nvmlGetFunctionPointer("nvmlErrorString") + fn.restype = c_char_p # otherwise return is an int + ret = fn(result) + return ret + +# Added in 2.285 +@convertStrBytes +def nvmlSystemGetNVMLVersion(): + c_version = create_string_buffer(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetNVMLVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +def nvmlSystemGetCudaDriverVersion(): + c_cuda_version = c_int() + fn = _nvmlGetFunctionPointer("nvmlSystemGetCudaDriverVersion") + ret = fn(byref(c_cuda_version)) + _nvmlCheckReturn(ret) + return c_cuda_version.value + +def nvmlSystemGetCudaDriverVersion_v2(): + c_cuda_version = c_int() + fn = _nvmlGetFunctionPointer("nvmlSystemGetCudaDriverVersion_v2") + ret = fn(byref(c_cuda_version)) + _nvmlCheckReturn(ret) + return c_cuda_version.value + +# Added in 2.285 +@convertStrBytes +def nvmlSystemGetProcessName(pid): + c_name = create_string_buffer(1024) + fn = _nvmlGetFunctionPointer("nvmlSystemGetProcessName") + 
ret = fn(c_uint(pid), c_name, c_uint(1024)) + _nvmlCheckReturn(ret) + return c_name.value + +@convertStrBytes +def nvmlSystemGetDriverVersion(): + c_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetDriverVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlSystemGetHicVersion(): + c_count = c_uint(0) + hics = None + fn = _nvmlGetFunctionPointer("nvmlSystemGetHicVersion") + + # get the count + ret = fn(byref(c_count), None) + + # this should only fail with insufficient size + if ((ret != NVML_SUCCESS) and + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + raise NVMLError(ret) + + # If there are no hics + if (c_count.value == 0): + return [] + + hic_array = c_nvmlHwbcEntry_t * c_count.value + hics = hic_array() + ret = fn(byref(c_count), hics) + _nvmlCheckReturn(ret) + return hics + +def nvmlSystemGetDriverBranch(): + c_branchInfo = c_nvmlSystemDriverBranchInfo_v1_t(0) + c_branchInfo.version = SystemDriverBranchInfo_v1 + fn = _nvmlGetFunctionPointer("nvmlSystemGetDriverBranch") + ret = fn(byref(c_branchInfo), c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_branchInfo + +## Unit get functions +def nvmlUnitGetCount(): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlUnitGetCount") + ret = fn(byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlUnitGetHandleByIndex(index): + c_index = c_uint(index) + unit = c_nvmlUnit_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetHandleByIndex") + ret = fn(c_index, byref(unit)) + _nvmlCheckReturn(ret) + return unit + +def nvmlUnitGetUnitInfo(unit): + c_info = c_nvmlUnitInfo_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetUnitInfo") + ret = fn(unit, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlUnitGetLedState(unit): + c_state = c_nvmlLedState_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetLedState") + ret = fn(unit, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state + +def nvmlUnitGetPsuInfo(unit): + c_info = c_nvmlPSUInfo_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetPsuInfo") + ret = fn(unit, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlUnitGetTemperature(unit, type): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlUnitGetTemperature") + ret = fn(unit, c_uint(type), byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.value + +def nvmlUnitGetFanSpeedInfo(unit): + c_speeds = c_nvmlUnitFanSpeeds_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetFanSpeedInfo") + ret = fn(unit, byref(c_speeds)) + _nvmlCheckReturn(ret) + return c_speeds + +# added to API +def nvmlUnitGetDeviceCount(unit): + c_count = c_uint(0) + # query the unit to determine device count + fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices") + ret = fn(unit, byref(c_count), None) + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + ret = NVML_SUCCESS + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlUnitGetDevices(unit): + c_count = c_uint(nvmlUnitGetDeviceCount(unit)) + device_array = c_nvmlDevice_t * c_count.value + c_devices = device_array() + fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices") + ret = fn(unit, byref(c_count), c_devices) + _nvmlCheckReturn(ret) + return c_devices + +## Device get functions +def nvmlDeviceGetCount(): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCount_v2") + ret = fn(byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def 
nvmlDeviceGetHandleByIndex(index): + c_index = c_uint(index) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByIndex_v2") + ret = fn(c_index, byref(device)) + _nvmlCheckReturn(ret) + return device + +@convertStrBytes +def nvmlDeviceGetHandleBySerial(serial): + c_serial = c_char_p(serial) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleBySerial") + ret = fn(c_serial, byref(device)) + _nvmlCheckReturn(ret) + return device + +@convertStrBytes +def nvmlDeviceGetHandleByUUID(uuid): + c_uuid = c_char_p(uuid) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByUUID") + ret = fn(c_uuid, byref(device)) + _nvmlCheckReturn(ret) + return device + +@convertStrBytes +def nvmlDeviceGetHandleByPciBusId(pciBusId): + c_busId = c_char_p(pciBusId) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByPciBusId_v2") + ret = fn(c_busId, byref(device)) + _nvmlCheckReturn(ret) + return device + +@convertStrBytes +def nvmlDeviceGetName(handle): + c_name = create_string_buffer(NVML_DEVICE_NAME_V2_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetName") + ret = fn(handle, c_name, c_uint(NVML_DEVICE_NAME_V2_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_name.value + +class c_nvmlDevicePerfModes_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('str', c_char * NVML_PERF_MODES_BUFFER_SIZE), + ] + +nvmlDevicePerfModes_v1 = 0x1000804 + +@convertStrBytes +def nvmlDeviceGetPerformanceModes(handle): + perfModes = c_nvmlDevicePerfModes_v1_t() + perfModes.version = nvmlDevicePerfModes_v1 + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPerformanceModes") + ret = fn(handle, byref(perfModes)) + _nvmlCheckReturn(ret) + return perfModes.str + +class c_nvmlDeviceCurrentClockFreqs_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('str', c_char * NVML_PERF_MODES_BUFFER_SIZE), + ] + +nvmlDeviceCurrentClockFreqs_v1 = 0x1000804 + +@convertStrBytes +def nvmlDeviceGetCurrentClockFreqs(handle): + currentClockFreqs = c_nvmlDeviceCurrentClockFreqs_v1_t() + currentClockFreqs.version = nvmlDeviceCurrentClockFreqs_v1 + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrentClockFreqs") + ret = fn(handle, byref(currentClockFreqs)) + _nvmlCheckReturn(ret) + return currentClockFreqs.str + +def nvmlDeviceGetBoardId(handle): + c_id = c_uint(); + fn = _nvmlGetFunctionPointer("nvmlDeviceGetBoardId") + ret = fn(handle, byref(c_id)) + _nvmlCheckReturn(ret) + return c_id.value + +def nvmlDeviceGetMultiGpuBoard(handle): + c_multiGpu = c_uint(); + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMultiGpuBoard") + ret = fn(handle, byref(c_multiGpu)) + _nvmlCheckReturn(ret) + return c_multiGpu.value + +def nvmlDeviceGetBrand(handle): + c_type = _nvmlBrandType_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetBrand") + ret = fn(handle, byref(c_type)) + _nvmlCheckReturn(ret) + return c_type.value + +def nvmlDeviceGetC2cModeInfoV1(handle): + c_info = c_nvmlC2cModeInfo_v1_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetC2cModeInfoV") + ret = fn(handle, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlDeviceGetC2cModeInfoV(handle): + return nvmlDeviceGetC2cModeInfoV1(handle) + +@convertStrBytes +def nvmlDeviceGetBoardPartNumber(handle): + c_part_number = create_string_buffer(NVML_DEVICE_PART_NUMBER_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetBoardPartNumber") + ret = fn(handle, c_part_number, c_uint(NVML_DEVICE_PART_NUMBER_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return 
c_part_number.value + +@convertStrBytes +def nvmlDeviceGetSerial(handle): + c_serial = create_string_buffer(NVML_DEVICE_SERIAL_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSerial") + ret = fn(handle, c_serial, c_uint(NVML_DEVICE_SERIAL_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_serial.value + +def nvmlDeviceGetModuleId(handle, moduleId=c_uint()): + isReference = type(moduleId) is not c_uint + moduleIdRef = moduleId if isReference else byref(moduleId) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetModuleId") + ret = fn(handle, moduleIdRef) + if isReference: + return ret + else: + _nvmlCheckReturn(ret) + return moduleId.value + +def nvmlDeviceGetMemoryAffinity(handle, nodeSetSize, scope): + affinity_array = c_ulonglong * nodeSetSize + c_affinity = affinity_array() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryAffinity") + ret = fn(handle, nodeSetSize, byref(c_affinity), _nvmlAffinityScope_t(scope)) + _nvmlCheckReturn(ret) + return c_affinity + +def nvmlDeviceGetCpuAffinityWithinScope(handle, cpuSetSize, scope): + affinity_array = c_ulonglong * cpuSetSize + c_affinity = affinity_array() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCpuAffinityWithinScope") + ret = fn(handle, cpuSetSize, byref(c_affinity), _nvmlAffinityScope_t(scope)) + _nvmlCheckReturn(ret) + return c_affinity + +def nvmlDeviceGetCpuAffinity(handle, cpuSetSize): + affinity_array = c_ulonglong * cpuSetSize + c_affinity = affinity_array() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCpuAffinity") + ret = fn(handle, cpuSetSize, byref(c_affinity)) + _nvmlCheckReturn(ret) + return c_affinity + +def nvmlDeviceSetCpuAffinity(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetCpuAffinity") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceClearCpuAffinity(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceClearCpuAffinity") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetNumaNodeId(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNumaNodeId") + node = c_int() + ret = fn(handle, byref(node)) + _nvmlCheckReturn(ret) + return node.value + +def nvmlDeviceGetMinorNumber(handle): + c_minor_number = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinorNumber") + ret = fn(handle, byref(c_minor_number)) + _nvmlCheckReturn(ret) + return c_minor_number.value + +@convertStrBytes +def nvmlDeviceGetUUID(handle): + c_uuid = create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID") + ret = fn(handle, c_uuid, c_uint(NVML_DEVICE_UUID_V2_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_uuid.value + +@convertStrBytes +def nvmlDeviceGetInforomVersion(handle, infoRomObject): + c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomVersion") + ret = fn(handle, _nvmlInforomObject_t(infoRomObject), + c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 4.304 +@convertStrBytes +def nvmlDeviceGetInforomImageVersion(handle): + c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomImageVersion") + ret = fn(handle, c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 4.304 +def nvmlDeviceGetInforomConfigurationChecksum(handle): + c_checksum = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomConfigurationChecksum") + ret = fn(handle, 
byref(c_checksum)) + _nvmlCheckReturn(ret) + return c_checksum.value + +# Added in 4.304 +def nvmlDeviceValidateInforom(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceValidateInforom") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetLastBBXFlushTime(handle): + c_timestamp = c_ulonglong() + c_durationUs = c_ulong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetLastBBXFlushTime") + ret = fn(handle, byref(c_timestamp), byref(c_durationUs)) + _nvmlCheckReturn(ret) + return [c_timestamp.value, c_durationUs.value] + +def nvmlDeviceGetDisplayMode(handle): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDisplayMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetDisplayActive(handle): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDisplayActive") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + + +def nvmlDeviceGetPersistenceMode(handle): + c_state = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPersistenceMode") + ret = fn(handle, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + +def nvmlDeviceGetPciInfoExt(handle, c_info): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPciInfoExt") + ret = fn(handle, c_info) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetPciInfo_v3(handle): + c_info = nvmlPciInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPciInfo_v3") + ret = fn(handle, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlDeviceGetPciInfo(handle): + return nvmlDeviceGetPciInfo_v3(handle) + +def nvmlDeviceGetClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +# Added in 2.285 +def nvmlDeviceGetMaxClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +# Added in 4.304 +def nvmlDeviceGetApplicationsClock(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetApplicationsClock") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +def nvmlDeviceGetMaxCustomerBoostClock(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxCustomerBoostClock") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +def nvmlDeviceGetClock(handle, type, id): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClock") + ret = fn(handle, _nvmlClockType_t(type), _nvmlClockId_t(id), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +# Added in 5.319 +def nvmlDeviceGetDefaultApplicationsClock(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDefaultApplicationsClock") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +# Added in 4.304 +def nvmlDeviceGetSupportedMemoryClocks(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedMemoryClocks") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no clocks + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + clocks_array = 
c_uint * c_count.value + c_clocks = clocks_array() + + # make the call again + ret = fn(handle, byref(c_count), c_clocks) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + procs.append(c_clocks[i]) + + return procs + else: + # error case + raise NVMLError(ret) + +# Added in 4.304 +def nvmlDeviceGetSupportedGraphicsClocks(handle, memoryClockMHz): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedGraphicsClocks") + ret = fn(handle, c_uint(memoryClockMHz), byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no clocks + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + clocks_array = c_uint * c_count.value + c_clocks = clocks_array() + + # make the call again + ret = fn(handle, c_uint(memoryClockMHz), byref(c_count), c_clocks) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + procs.append(c_clocks[i]) + + return procs + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetFanSpeed(handle): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed") + ret = fn(handle, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +def nvmlDeviceGetFanSpeed_v2(handle, fan): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed_v2") + ret = fn(handle, fan, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +class c_nvmlFanSpeedInfo_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('fan', c_uint), + ('speed', c_uint), + ] + +nvmlFanSpeedInfo_v1 = 0x100000C + +def nvmlDeviceGetFanSpeedRPM(handle): + c_fanSpeed = c_nvmlFanSpeedInfo_t() + c_fanSpeed.fan = 0 + c_fanSpeed.version = nvmlFanSpeedInfo_v1 + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeedRPM") + ret = fn(handle, byref(c_fanSpeed)) + _nvmlCheckReturn(ret) + return c_fanSpeed.speed + +def nvmlDeviceGetTargetFanSpeed(handle, fan): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTargetFanSpeed") + ret = fn(handle, fan, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +def nvmlDeviceGetNumFans(device): + c_numFans = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNumFans") + ret = fn(device, byref(c_numFans)) + _nvmlCheckReturn(ret) + return c_numFans.value + +def nvmlDeviceSetDefaultFanSpeed_v2(handle, index): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDefaultFanSpeed_v2"); + ret = fn(handle, index) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetMinMaxFanSpeed(handle, minSpeed=c_uint(), maxSpeed=c_uint()): + isReference = (type(minSpeed) is not c_uint) or (type(maxSpeed) is not c_uint) + minSpeedRef = minSpeed if isReference else byref(minSpeed) + maxSpeedRef = maxSpeed if isReference else byref(maxSpeed) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinMaxFanSpeed") + ret = fn(handle, minSpeedRef, maxSpeedRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else [minSpeed.value, maxSpeed.value] + +def nvmlDeviceGetFanControlPolicy_v2(handle, fan, fanControlPolicy=c_uint()): + isReference = type(fanControlPolicy) is not c_uint + fanControlPolicyRef = fanControlPolicy if isReference else byref(fanControlPolicy) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanControlPolicy_v2") + ret = fn(handle, fan, fanControlPolicyRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else fanControlPolicy.value + +def nvmlDeviceSetFanControlPolicy(handle, fan, fanControlPolicy): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetFanControlPolicy") + ret = 
fn(handle, fan, _nvmlFanControlPolicy_t(fanControlPolicy)) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +class c_nvmlTemperature_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('sensorType', _nvmlTemperatureSensors_t), + ('temperature', c_int), + ] +nvmlTemperature_v1 = 0x100000C + +def nvmlDeviceGetTemperatureV1(handle, sensor): + c_temp = c_nvmlTemperature_v1_t() + c_temp.version = nvmlTemperature_v1 + c_temp.sensorType = _nvmlTemperatureSensors_t(sensor) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperatureV") + ret = fn(handle, byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.temperature + +def nvmlDeviceGetTemperatureV(handle, sensor, version=nvmlTemperature_v1): + if version == nvmlTemperature_v1: + return nvmlDeviceGetTemperatureV1(handle, sensor) + else: + raise NVMLError(NVML_ERROR_ARGUMENT_VERSION_MISMATCH) + +# DEPRECATED use nvmlDeviceGetTemperatureV instead +def nvmlDeviceGetTemperature(handle, sensor): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperature") + ret = fn(handle, _nvmlTemperatureSensors_t(sensor), byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.value + +def nvmlDeviceGetTemperatureThreshold(handle, threshold): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperatureThreshold") + ret = fn(handle, _nvmlTemperatureThresholds_t(threshold), byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.value + +def nvmlDeviceSetTemperatureThreshold(handle, threshold, temp): + c_temp = c_uint() + c_temp.value = temp + fn = _nvmlGetFunctionPointer("nvmlDeviceSetTemperatureThreshold") + ret = fn(handle, _nvmlTemperatureThresholds_t(threshold), byref(c_temp)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetMarginTemperature(handle): + c_marginTempInfo = c_nvmlMarginTemperature_v1_t() + c_marginTempInfo.version = nvmlMarginTemperature_v1 + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMarginTemperature") + ret = fn(handle, byref(c_marginTempInfo)) + _nvmlCheckReturn(ret) + return c_marginTempInfo.marginTemperature + +# DEPRECATED use nvmlDeviceGetPerformanceState +def nvmlDeviceGetPowerState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPerformanceState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPerformanceState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPowerManagementMode(handle): + c_pcapMode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementMode") + ret = fn(handle, byref(c_pcapMode)) + _nvmlCheckReturn(ret) + return c_pcapMode.value + +def nvmlDeviceGetPowerManagementLimit(handle): + c_limit = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementLimit") + ret = fn(handle, byref(c_limit)) + _nvmlCheckReturn(ret) + return c_limit.value + +# Added in 4.304 +def nvmlDeviceGetPowerManagementLimitConstraints(handle): + c_minLimit = c_uint() + c_maxLimit = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementLimitConstraints") + ret = fn(handle, byref(c_minLimit), byref(c_maxLimit)) + _nvmlCheckReturn(ret) + return [c_minLimit.value, c_maxLimit.value] + +# Added in 4.304 +def nvmlDeviceGetPowerManagementDefaultLimit(handle): + c_limit = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementDefaultLimit") + ret = fn(handle, byref(c_limit)) + _nvmlCheckReturn(ret) 
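+ # The default limit is reported by NVML in milliwatts.
+ # Minimal usage sketch (assumes NVML was initialised elsewhere via nvmlInit()
+ # and a device handle was obtained, e.g. with nvmlDeviceGetHandleByIndex(0)):
+ #   default_mw = nvmlDeviceGetPowerManagementDefaultLimit(handle)
+ #   print("default power limit: %.0f W" % (default_mw / 1000.0))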
+ return c_limit.value + + +# Added in 331 +def nvmlDeviceGetEnforcedPowerLimit(handle): + c_limit = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEnforcedPowerLimit") + ret = fn(handle, byref(c_limit)) + _nvmlCheckReturn(ret) + return c_limit.value + +def nvmlDeviceGetPowerUsage(handle): + c_watts = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerUsage") + ret = fn(handle, byref(c_watts)) + _nvmlCheckReturn(ret) + return c_watts.value + +def nvmlDeviceGetTotalEnergyConsumption(handle): + c_millijoules = c_uint64() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEnergyConsumption") + ret = fn(handle, byref(c_millijoules)) + _nvmlCheckReturn(ret) + return c_millijoules.value + +# Added in 4.304 +def nvmlDeviceGetGpuOperationMode(handle): + c_currState = _nvmlGpuOperationMode_t() + c_pendingState = _nvmlGpuOperationMode_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuOperationMode") + ret = fn(handle, byref(c_currState), byref(c_pendingState)) + _nvmlCheckReturn(ret) + return [c_currState.value, c_pendingState.value] + +# Added in 4.304 +def nvmlDeviceGetCurrentGpuOperationMode(handle): + return nvmlDeviceGetGpuOperationMode(handle)[0] + +# Added in 4.304 +def nvmlDeviceGetPendingGpuOperationMode(handle): + return nvmlDeviceGetGpuOperationMode(handle)[1] + +def nvmlDeviceGetMemoryInfo(handle, version=None): + if not version: + c_memory = c_nvmlMemory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo") + else: + c_memory = c_nvmlMemory_v2_t() + c_memory.version = version + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo_v2") + ret = fn(handle, byref(c_memory)) + _nvmlCheckReturn(ret) + return c_memory + +def nvmlDeviceGetBAR1MemoryInfo(handle): + c_bar1_memory = c_nvmlBAR1Memory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetBAR1MemoryInfo") + ret = fn(handle, byref(c_bar1_memory)) + _nvmlCheckReturn(ret) + return c_bar1_memory + +def nvmlDeviceGetComputeMode(handle): + c_mode = _nvmlComputeMode_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetCudaComputeCapability(handle): + c_major = c_int() + c_minor = c_int() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCudaComputeCapability") + ret = fn(handle, byref(c_major), byref(c_minor)) + _nvmlCheckReturn(ret) + return (c_major.value, c_minor.value) + +def nvmlDeviceGetEccMode(handle): + c_currState = _nvmlEnableState_t() + c_pendingState = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEccMode") + ret = fn(handle, byref(c_currState), byref(c_pendingState)) + _nvmlCheckReturn(ret) + return [c_currState.value, c_pendingState.value] + +# added to API +def nvmlDeviceGetCurrentEccMode(handle): + return nvmlDeviceGetEccMode(handle)[0] + +# added to API +def nvmlDeviceGetPendingEccMode(handle): + return nvmlDeviceGetEccMode(handle)[1] + +def nvmlDeviceGetDefaultEccMode(handle): + c_defaultState = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDefaultEccMode") + ret = fn(handle, byref(c_defaultState)) + _nvmlCheckReturn(ret) + return [c_defaultState.value] + +def nvmlDeviceGetTotalEccErrors(handle, errorType, counterType): + c_count = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEccErrors") + ret = fn(handle, _nvmlMemoryErrorType_t(errorType), + _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +# This is deprecated, instead use nvmlDeviceGetMemoryErrorCounter +def 
nvmlDeviceGetDetailedEccErrors(handle, errorType, counterType): + c_counts = c_nvmlEccErrorCounts_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDetailedEccErrors") + ret = fn(handle, _nvmlMemoryErrorType_t(errorType), + _nvmlEccCounterType_t(counterType), byref(c_counts)) + _nvmlCheckReturn(ret) + return c_counts + +# Added in 4.304 +def nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, locationType): + c_count = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryErrorCounter") + ret = fn(handle, + _nvmlMemoryErrorType_t(errorType), + _nvmlEccCounterType_t(counterType), + _nvmlMemoryLocation_t(locationType), + byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetUtilizationRates(handle): + c_util = c_nvmlUtilization_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUtilizationRates") + ret = fn(handle, byref(c_util)) + _nvmlCheckReturn(ret) + return c_util + +def nvmlDeviceGetEncoderUtilization(handle): + c_util = c_uint() + c_samplingPeriod = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEncoderUtilization") + ret = fn(handle, byref(c_util), byref(c_samplingPeriod)) + _nvmlCheckReturn(ret) + return [c_util.value, c_samplingPeriod.value] + +def nvmlDeviceGetDecoderUtilization(handle): + c_util = c_uint() + c_samplingPeriod = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDecoderUtilization") + ret = fn(handle, byref(c_util), byref(c_samplingPeriod)) + _nvmlCheckReturn(ret) + return [c_util.value, c_samplingPeriod.value] + +def nvmlDeviceGetJpgUtilization(handle): + c_util = c_uint() + c_samplingPeriod = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetJpgUtilization") + ret = fn(handle, byref(c_util), byref(c_samplingPeriod)) + _nvmlCheckReturn(ret) + return [c_util.value, c_samplingPeriod.value] + +def nvmlDeviceGetOfaUtilization(handle): + c_util = c_uint() + c_samplingPeriod = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetOfaUtilization") + ret = fn(handle, byref(c_util), byref(c_samplingPeriod)) + _nvmlCheckReturn(ret) + return [c_util.value, c_samplingPeriod.value] + +def nvmlDeviceGetPcieReplayCounter(handle): + c_replay = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieReplayCounter") + ret = fn(handle, byref(c_replay)) + _nvmlCheckReturn(ret) + return c_replay.value + +def nvmlDeviceGetDriverModel(handle): + c_currModel = _nvmlDriverModel_t() + c_pendingModel = _nvmlDriverModel_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDriverModel") + ret = fn(handle, byref(c_currModel), byref(c_pendingModel)) + _nvmlCheckReturn(ret) + return [c_currModel.value, c_pendingModel.value] + +# added to API +def nvmlDeviceGetCurrentDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[0] + +# added to API +def nvmlDeviceGetPendingDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[1] + +# Added in 2.285 +@convertStrBytes +def nvmlDeviceGetVbiosVersion(handle): + c_version = create_string_buffer(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVbiosVersion") + ret = fn(handle, c_version, c_uint(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlDeviceGetComputeRunningProcesses_v2(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses_v2") + ret = fn(handle, byref(c_count), None) + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # 
typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_v2_t * c_count.value + c_procs = proc_array() + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + return procs + else: + # error case + raise NVMLError(ret) + +# Added in 2.285 +def nvmlDeviceGetComputeRunningProcesses_v3(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses_v3") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_v3_t * c_count.value + c_procs = proc_array() + + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +@throwOnVersionMismatch +def nvmlDeviceGetComputeRunningProcesses(handle): + return nvmlDeviceGetComputeRunningProcesses_v3(handle) + +def nvmlDeviceGetGraphicsRunningProcesses_v2(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGraphicsRunningProcesses_v2") + ret = fn(handle, byref(c_count), None) + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_v2_t * c_count.value + c_procs = proc_array() + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + return procs + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetGraphicsRunningProcesses_v3(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGraphicsRunningProcesses_v3") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_v3_t * c_count.value + c_procs = proc_array() + + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for 
this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +@throwOnVersionMismatch +def nvmlDeviceGetGraphicsRunningProcesses(handle): + return nvmlDeviceGetGraphicsRunningProcesses_v3(handle) + +@throwOnVersionMismatch +def nvmlDeviceGetMPSComputeRunningProcesses(handle): + return nvmlDeviceGetMPSComputeRunningProcesses_v3(handle) + +def nvmlDeviceGetMPSComputeRunningProcesses_v2(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMPSComputeRunningProcesses_v2") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_v2_t * c_count.value + c_procs = proc_array() + + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMPSComputeRunningProcesses_v3") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_v3_t * c_count.value + c_procs = proc_array() + + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetRunningProcessDetailList(handle, version, mode): + c_processDetailList = c_nvmlProcessDetailList_t() + c_processDetailList.version = version + c_processDetailList.mode = mode + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetRunningProcessDetailList") + + # first call to get the size + ret = fn(handle, byref(c_processDetailList)) + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + c_procs = c_nvmlProcessDetail_v1_t * c_processDetailList.numProcArrayEntries + c_processDetailList.procArray = cast((c_procs)(), POINTER(c_nvmlProcessDetail_v1_t)) + + # make the call again + ret = fn(handle, byref(c_processDetailList)) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_processDetailList.numProcArrayEntries): + # use an alternative struct for this object + obj = c_processDetailList.procArray[i] + if (obj.usedGpuMemory == 
NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + obj.usedGpuMemory = None + if (obj.usedGpuCcProtectedMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + obj.usedGpuCcProtectedMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetAutoBoostedClocksEnabled(handle): + c_isEnabled = _nvmlEnableState_t() + c_defaultIsEnabled = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAutoBoostedClocksEnabled") + ret = fn(handle, byref(c_isEnabled), byref(c_defaultIsEnabled)) + _nvmlCheckReturn(ret) + return [c_isEnabled.value, c_defaultIsEnabled.value] + #Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks + +## Set functions +def nvmlUnitSetLedState(unit, color): + fn = _nvmlGetFunctionPointer("nvmlUnitSetLedState") + ret = fn(unit, _nvmlLedColor_t(color)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetPersistenceMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetPersistenceMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetComputeMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetComputeMode") + ret = fn(handle, _nvmlComputeMode_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetEccMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetEccMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceClearEccErrorCounts(handle, counterType): + fn = _nvmlGetFunctionPointer("nvmlDeviceClearEccErrorCounts") + ret = fn(handle, _nvmlEccCounterType_t(counterType)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetDriverModel(handle, model): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDriverModel") + ret = fn(handle, _nvmlDriverModel_t(model)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetAutoBoostedClocksEnabled(handle, enabled): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetAutoBoostedClocksEnabled") + ret = fn(handle, _nvmlEnableState_t(enabled)) + _nvmlCheckReturn(ret) + return None + #Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks + +def nvmlDeviceSetDefaultAutoBoostedClocksEnabled(handle, enabled, flags): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDefaultAutoBoostedClocksEnabled") + ret = fn(handle, _nvmlEnableState_t(enabled), c_uint(flags)) + _nvmlCheckReturn(ret) + return None + #Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks + +def nvmlDeviceSetGpuLockedClocks(handle, minGpuClockMHz, maxGpuClockMHz): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetGpuLockedClocks") + ret = fn(handle, c_uint(minGpuClockMHz), c_uint(maxGpuClockMHz)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceResetGpuLockedClocks(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceResetGpuLockedClocks") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetMemoryLockedClocks(handle, minMemClockMHz, maxMemClockMHz): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetMemoryLockedClocks") + ret = fn(handle, c_uint(minMemClockMHz), c_uint(maxMemClockMHz)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceResetMemoryLockedClocks(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceResetMemoryLockedClocks") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetClkMonStatus(handle, c_clkMonInfo=nvmlClkMonStatus_t()): + isReference = type(c_clkMonInfo) is not nvmlClkMonStatus_t + c_clkMonInfoRef = c_clkMonInfo if 
isReference else byref(c_clkMonInfo) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClkMonStatus") + ret = fn(handle, c_clkMonInfoRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else c_clkMonInfo + +# Added in 4.304 +def nvmlDeviceSetApplicationsClocks(handle, maxMemClockMHz, maxGraphicsClockMHz): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetApplicationsClocks") + ret = fn(handle, c_uint(maxMemClockMHz), c_uint(maxGraphicsClockMHz)) + _nvmlCheckReturn(ret) + return None + +# Added in 4.304 +def nvmlDeviceResetApplicationsClocks(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceResetApplicationsClocks") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + +# Added in 4.304 +def nvmlDeviceSetPowerManagementLimit(handle, limit): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetPowerManagementLimit") + ret = fn(handle, c_uint(limit)) + _nvmlCheckReturn(ret) + return None + +# Added in 4.304 +def nvmlDeviceSetGpuOperationMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetGpuOperationMode") + ret = fn(handle, _nvmlGpuOperationMode_t(mode)) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlEventSetCreate(): + fn = _nvmlGetFunctionPointer("nvmlEventSetCreate") + eventSet = c_nvmlEventSet_t() + ret = fn(byref(eventSet)) + _nvmlCheckReturn(ret) + return eventSet + +# Added in 2.285 +def nvmlDeviceRegisterEvents(handle, eventTypes, eventSet): + fn = _nvmlGetFunctionPointer("nvmlDeviceRegisterEvents") + ret = fn(handle, c_ulonglong(eventTypes), eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlDeviceGetSupportedEventTypes(handle): + c_eventTypes = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedEventTypes") + ret = fn(handle, byref(c_eventTypes)) + _nvmlCheckReturn(ret) + return c_eventTypes.value + +# raises NVML_ERROR_TIMEOUT exception on timeout +def nvmlEventSetWait_v2(eventSet, timeoutms): + fn = _nvmlGetFunctionPointer("nvmlEventSetWait_v2") + data = c_nvmlEventData_t() + ret = fn(eventSet, byref(data), c_uint(timeoutms)) + _nvmlCheckReturn(ret) + return data + +def nvmlEventSetWait(eventSet, timeoutms): + return nvmlEventSetWait_v2(eventSet, timeoutms) + +# Added in 2.285 +def nvmlEventSetFree(eventSet): + fn = _nvmlGetFunctionPointer("nvmlEventSetFree") + ret = fn(eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 3.295 +def nvmlDeviceOnSameBoard(handle1, handle2): + fn = _nvmlGetFunctionPointer("nvmlDeviceOnSameBoard") + onSameBoard = c_int() + ret = fn(handle1, handle2, byref(onSameBoard)) + _nvmlCheckReturn(ret) + return (onSameBoard.value != 0) + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + +def 
nvmlDeviceGetGpuMaxPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuMaxPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 4.304 +def nvmlDeviceGetSupportedClocksThrottleReasons(handle): + c_reasons= c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedClocksThrottleReasons") + ret = fn(handle, byref(c_reasons)) + _nvmlCheckReturn(ret) + return c_reasons.value + +def nvmlDeviceGetSupportedClocksEventReasons(handle): + c_reasons= c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedClocksEventReasons") + ret = fn(handle, byref(c_reasons)) + _nvmlCheckReturn(ret) + return c_reasons.value + +# Added in 4.304 +def nvmlDeviceGetCurrentClocksThrottleReasons(handle): + c_reasons= c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrentClocksThrottleReasons") + ret = fn(handle, byref(c_reasons)) + _nvmlCheckReturn(ret) + return c_reasons.value + +def nvmlDeviceGetCurrentClocksEventReasons(handle): + c_reasons= c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrentClocksEventReasons") + ret = fn(handle, byref(c_reasons)) + _nvmlCheckReturn(ret) + return c_reasons.value + +# Added in 5.319 +def nvmlDeviceGetIndex(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetIndex") + c_index = c_uint() + ret = fn(handle, byref(c_index)) + _nvmlCheckReturn(ret) + return c_index.value + +# Added in 5.319 +def nvmlDeviceGetAccountingMode(handle): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAccountingMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceSetAccountingMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetAccountingMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceClearAccountingPids(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceClearAccountingPids") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetAccountingStats(handle, pid): + stats = c_nvmlAccountingStats_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAccountingStats") + ret = fn(handle, c_uint(pid), byref(stats)) + _nvmlCheckReturn(ret) + if (stats.maxMemoryUsage == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + stats.maxMemoryUsage = None + return stats + +def nvmlDeviceGetAccountingPids(handle): + count = c_uint(nvmlDeviceGetAccountingBufferSize(handle)) + pids = (c_uint * count.value)() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAccountingPids") + ret = fn(handle, byref(count), pids) + _nvmlCheckReturn(ret) + return list(map(int, pids[0:count.value])) + +def nvmlDeviceGetAccountingBufferSize(handle): + bufferSize = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAccountingBufferSize") + ret = fn(handle, byref(bufferSize)) + _nvmlCheckReturn(ret) + return int(bufferSize.value) + +def nvmlDeviceGetRetiredPages(device, sourceFilter): + c_source = _nvmlPageRetirementCause_t(sourceFilter) + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetRetiredPages") + + # First call will get the size + ret = fn(device, c_source, byref(c_count), None) + + # this should only fail with insufficient size + if ((ret != NVML_SUCCESS) and + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + raise NVMLError(ret) + + # call again with a buffer + # oversize the array for the rare cases where additional pages + # are retired between NVML calls + c_count.value = 
c_count.value * 2 + 5 + page_array = c_ulonglong * c_count.value + c_pages = page_array() + ret = fn(device, c_source, byref(c_count), c_pages) + _nvmlCheckReturn(ret) + return list(map(int, c_pages[0:c_count.value])) + +def nvmlDeviceGetRetiredPages_v2(device, sourceFilter): + c_source = _nvmlPageRetirementCause_t(sourceFilter) + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetRetiredPages_v2") + + # First call will get the size + ret = fn(device, c_source, byref(c_count), None) + + # this should only fail with insufficient size + if ((ret != NVML_SUCCESS) and + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + raise NVMLError(ret) + + # call again with a buffer + # oversize the array for the rare cases where additional pages + # are retired between NVML calls + c_count.value = c_count.value * 2 + 5 + page_array = c_ulonglong * c_count.value + c_pages = page_array() + times_array = c_ulonglong * c_count.value + c_times = times_array() + ret = fn(device, c_source, byref(c_count), c_pages, c_times) + _nvmlCheckReturn(ret) + return [ { 'address': int(c_pages[i]), 'timestamp': int(c_times[i]) } for i in range(c_count.value) ]; + +def nvmlDeviceGetRetiredPagesPendingStatus(device): + c_pending = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetRetiredPagesPendingStatus") + ret = fn(device, byref(c_pending)) + _nvmlCheckReturn(ret) + return int(c_pending.value) + +def nvmlDeviceGetAPIRestriction(device, apiType): + c_permission = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAPIRestriction") + ret = fn(device, _nvmlRestrictedAPI_t(apiType), byref(c_permission)) + _nvmlCheckReturn(ret) + return int(c_permission.value) + +def nvmlDeviceSetAPIRestriction(handle, apiType, isRestricted): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetAPIRestriction") + ret = fn(handle, _nvmlRestrictedAPI_t(apiType), _nvmlEnableState_t(isRestricted)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetBridgeChipInfo(handle): + bridgeHierarchy = c_nvmlBridgeChipHierarchy_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetBridgeChipInfo") + ret = fn(handle, byref(bridgeHierarchy)) + _nvmlCheckReturn(ret) + return bridgeHierarchy + +def nvmlDeviceGetSamples(device, sampling_type, timeStamp): + c_sampling_type = _nvmlSamplingType_t(sampling_type) + c_time_stamp = c_ulonglong(timeStamp) + c_sample_count = c_uint(0) + c_sample_value_type = _nvmlValueType_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSamples") + + ## First Call gets the size + ret = fn(device, c_sampling_type, c_time_stamp, byref(c_sample_value_type), byref(c_sample_count), None) + + # Stop if this fails + if (ret != NVML_SUCCESS): + raise NVMLError(ret) + + sampleArray = c_sample_count.value * c_nvmlSample_t + c_samples = sampleArray() + ret = fn(device, c_sampling_type, c_time_stamp, byref(c_sample_value_type), byref(c_sample_count), c_samples) + _nvmlCheckReturn(ret) + return (c_sample_value_type.value, c_samples[0:c_sample_count.value]) + +def nvmlDeviceGetViolationStatus(device, perfPolicyType): + c_perfPolicy_type = _nvmlPerfPolicyType_t(perfPolicyType) + c_violTime = c_nvmlViolationTime_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetViolationStatus") + + ## Invoke the method to get violation time + ret = fn(device, c_perfPolicy_type, byref(c_violTime)) + _nvmlCheckReturn(ret) + return c_violTime + +def nvmlDeviceGetPcieThroughput(device, counter): + c_util = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieThroughput") + ret = fn(device, _nvmlPcieUtilCounter_t(counter), byref(c_util)) + 
_nvmlCheckReturn(ret) + return c_util.value + +def nvmlSystemGetTopologyGpuSet(cpuNumber): + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlSystemGetTopologyGpuSet") + + # First call will get the size + ret = fn(cpuNumber, byref(c_count), None) + + if ret != NVML_SUCCESS: + raise NVMLError(ret) + # call again with a buffer + device_array = c_nvmlDevice_t * c_count.value + c_devices = device_array() + ret = fn(cpuNumber, byref(c_count), c_devices) + _nvmlCheckReturn(ret) + return list(c_devices[0:c_count.value]) + +def nvmlDeviceGetTopologyNearestGpus(device, level): + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTopologyNearestGpus") + + # First call will get the size + ret = fn(device, level, byref(c_count), None) + + if ret != NVML_SUCCESS: + raise NVMLError(ret) + + # call again with a buffer + device_array = c_nvmlDevice_t * c_count.value + c_devices = device_array() + ret = fn(device, level, byref(c_count), c_devices) + _nvmlCheckReturn(ret) + return list(c_devices[0:c_count.value]) + +def nvmlDeviceGetTopologyCommonAncestor(device1, device2): + c_level = _nvmlGpuTopologyLevel_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTopologyCommonAncestor") + ret = fn(device1, device2, byref(c_level)) + _nvmlCheckReturn(ret) + return c_level.value + +def nvmlDeviceGetNvLinkUtilizationCounter(device, link, counter): + c_rxcounter = c_ulonglong() + c_txcounter = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkUtilizationCounter") + ret = fn(device, link, counter, byref(c_rxcounter), byref(c_txcounter)) + _nvmlCheckReturn(ret) + return (c_rxcounter.value, c_txcounter.value) + +def nvmlDeviceFreezeNvLinkUtilizationCounter(device, link, counter, freeze): + fn = _nvmlGetFunctionPointer("nvmlDeviceFreezeNvLinkUtilizationCounter") + ret = fn(device, link, counter, freeze) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceResetNvLinkUtilizationCounter(device, link, counter): + fn = _nvmlGetFunctionPointer("nvmlDeviceResetNvLinkUtilizationCounter") + ret = fn(device, link, counter) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetNvLinkUtilizationControl(device, link, counter, control, reset): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetNvLinkUtilizationControl") + ret = fn(device, link, counter, byref(control), reset) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetNvLinkUtilizationControl(device, link, counter): + c_control = nvmlNvLinkUtilizationControl_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkUtilizationControl") + ret = fn(device, link, counter, byref(c_control)) + _nvmlCheckReturn(ret) + return c_control + +def nvmlDeviceGetNvLinkCapability(device, link, capability): + c_capResult = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkCapability") + ret = fn(device, link, capability, byref(c_capResult)) + _nvmlCheckReturn(ret) + return c_capResult.value + +def nvmlDeviceGetNvLinkErrorCounter(device, link, counter): + c_result = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkErrorCounter") + ret = fn(device, link, counter, byref(c_result)) + _nvmlCheckReturn(ret) + return c_result.value + +def nvmlDeviceResetNvLinkErrorCounters(device, link): + fn = _nvmlGetFunctionPointer("nvmlDeviceResetNvLinkErrorCounters") + ret = fn(device, link) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetNvLinkRemotePciInfo(device, link): + c_pci = nvmlPciInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkRemotePciInfo_v2") + ret = fn(device, link, byref(c_pci)) + _nvmlCheckReturn(ret) + 
return c_pci + +def nvmlDeviceGetNvLinkRemoteDeviceType(handle, link): + c_type = _nvmlNvLinkDeviceType_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkRemoteDeviceType") + ret = fn(handle, link, byref(c_type)) + _nvmlCheckReturn(ret) + return c_type.value + +def nvmlDeviceGetNvLinkState(device, link): + c_isActive = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkState") + ret = fn(device, link, byref(c_isActive)) + _nvmlCheckReturn(ret) + return c_isActive.value + +def nvmlDeviceGetNvLinkVersion(device, link): + c_version = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkVersion") + ret = fn(device, link, byref(c_version)) + _nvmlCheckReturn(ret) + return c_version.value + +def nvmlDeviceModifyDrainState(pciInfo, newState): + fn = _nvmlGetFunctionPointer("nvmlDeviceModifyDrainState") + ret = fn(pointer(pciInfo), newState) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceQueryDrainState(pciInfo): + c_newState = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceQueryDrainState") + ret = fn(pointer(pciInfo), byref(c_newState)) + _nvmlCheckReturn(ret) + return c_newState.value + +def nvmlDeviceRemoveGpu(pciInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceRemoveGpu") + ret = fn(pointer(pciInfo)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceDiscoverGpus(pciInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceDiscoverGpus") + ret = fn(pointer(pciInfo)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceGetFieldValues(handle, fieldIds): + values_arr = c_nvmlFieldValue_t * len(fieldIds) + values = values_arr() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFieldValues") + + for i, fieldId in enumerate(fieldIds): + try: + (values[i].fieldId, values[i].scopeId) = fieldId + except TypeError: + values[i].fieldId = fieldId + + ret = fn(handle, c_int32(len(fieldIds)), byref(values)) + _nvmlCheckReturn(ret) + return values + +def nvmlDeviceClearFieldValues(handle, fieldIds): + values_arr = c_nvmlFieldValue_t * len(fieldIds) + values = values_arr() + fn = _nvmlGetFunctionPointer("nvmlDeviceClearFieldValues") + + for i, fieldId in enumerate(fieldIds): + try: + (values[i].fieldId, values[i].scopeId) = fieldId + except TypeError: + values[i].fieldId = fieldId + + ret = fn(handle, c_int32(len(fieldIds)), byref(values)) + _nvmlCheckReturn(ret) + return values + +def nvmlDeviceGetVirtualizationMode(handle): + c_virtualization_mode = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVirtualizationMode") + ret = fn(handle, byref(c_virtualization_mode)) + _nvmlCheckReturn(ret) + return c_virtualization_mode.value + +def nvmlDeviceSetVirtualizationMode(handle, virtualization_mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetVirtualizationMode") + return fn(handle, virtualization_mode) + +def nvmlDeviceGetVgpuHeterogeneousMode(handle): + c_vgpuHeterogeneousMode = c_nvmlVgpuHeterogeneousMode_v1_t(0) + c_vgpuHeterogeneousMode.version = VgpuHeterogeneousMode_v1 + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuHeterogeneousMode") + ret = fn(handle, byref(c_vgpuHeterogeneousMode)) + _nvmlCheckReturn(ret) + return c_vgpuHeterogeneousMode.mode + +def nvmlDeviceSetVgpuHeterogeneousMode(handle, heterogeneous_mode): + c_vgpuHeterogeneousMode = c_nvmlVgpuHeterogeneousMode_v1_t(0) + c_vgpuHeterogeneousMode.version = VgpuHeterogeneousMode_v1 + c_vgpuHeterogeneousMode.mode = heterogeneous_mode + fn = _nvmlGetFunctionPointer("nvmlDeviceSetVgpuHeterogeneousMode") + ret = fn(handle, byref(c_vgpuHeterogeneousMode)) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def 
nvmlVgpuInstanceGetPlacementId(vgpuInstance): + c_placement = c_nvmlVgpuPlacementId_v1_t(0) + c_placement.version = VgpuPlacementId_v1 + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetPlacementId") + ret = fn(vgpuInstance, byref(c_placement)) + _nvmlCheckReturn(ret) + return c_placement.placementId + +def nvmlDeviceGetVgpuTypeSupportedPlacements(handle, vgpuTypeId, mode=0, version=1): + c_max_instances = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstances") + ret = fn(handle, vgpuTypeId, byref(c_max_instances)) + _nvmlCheckReturn(ret) + + if version == 2: + c_vgpu_placements = c_nvmlVgpuPlacementList_v2_t() + c_vgpu_placements.version = VgpuPlacementList_v2 + c_vgpu_placements.count = c_max_instances.value + c_vgpu_placements.mode = mode + elif version == 1: + c_vgpu_placements = c_nvmlVgpuPlacementList_v1_t() + c_vgpu_placements.version = VgpuPlacementList_v1 + else: + raise NVMLError(NVML_ERROR_ARGUMENT_VERSION_MISMATCH) + + c_placements = c_uint * c_max_instances.value + c_vgpu_placements.placementIds = c_placements() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuTypeSupportedPlacements") + ret = fn(handle, vgpuTypeId, byref(c_vgpu_placements)) + _nvmlCheckReturn(ret) + return c_vgpu_placements + +def nvmlDeviceGetVgpuTypeCreatablePlacements(handle, vgpuTypeId, version=1): + c_max_instances = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstances") + ret = fn(handle, vgpuTypeId, byref(c_max_instances)) + _nvmlCheckReturn(ret) + + if version == 2: + c_vgpu_placements = c_nvmlVgpuPlacementList_v2_t() + c_vgpu_placements.version = VgpuPlacementList_v2 + c_vgpu_placements.count = c_max_instances.value + elif version == 1: + c_vgpu_placements = c_nvmlVgpuPlacementList_v1_t() + c_vgpu_placements.version = VgpuPlacementList_v1 + + c_placements = c_uint * c_max_instances.value + c_vgpu_placements.placementIds = c_placements() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuTypeCreatablePlacements") + ret = fn(handle, vgpuTypeId, byref(c_vgpu_placements)) + _nvmlCheckReturn(ret) + return c_vgpu_placements + +def nvmlGetVgpuDriverCapabilities(capability): + c_capResult = c_uint() + fn = _nvmlGetFunctionPointer("nvmlGetVgpuDriverCapabilities") + ret = fn(_nvmlVgpuDriverCapability_t(capability), byref(c_capResult)) + _nvmlCheckReturn(ret) + return c_capResult.value + +def nvmlDeviceGetVgpuCapabilities(handle, capability): + c_capResult = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuCapabilities") + ret = fn(handle, _nvmlDeviceVgpuCapability_t(capability), byref(c_capResult)) + _nvmlCheckReturn(ret) + return c_capResult.value + +def nvmlDeviceSetVgpuCapabilities(handle, capability, state): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetVgpuCapabilities") + ret = fn(handle, _nvmlDeviceVgpuCapability_t(capability), state) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetSupportedVgpus(handle): + # first call to get the size + c_vgpu_count = c_uint(0) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedVgpus") + ret = fn(handle, byref(c_vgpu_count), None) + + if (ret == NVML_SUCCESS): + # special case, no supported vGPUs + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + vgpu_type_ids_array = _nvmlVgpuTypeId_t * c_vgpu_count.value + c_vgpu_type_ids = vgpu_type_ids_array() + + # make the call again + ret = fn(handle, byref(c_vgpu_count), c_vgpu_type_ids) + _nvmlCheckReturn(ret) + vgpus = [] + for i in range(c_vgpu_count.value): + vgpus.append(c_vgpu_type_ids[i]) + return vgpus + else: + # error case + raise 
NVMLError(ret) + +def nvmlDeviceGetCreatableVgpus(handle): + # first call to get the size + c_vgpu_count = c_uint(0) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCreatableVgpus") + ret = fn(handle, byref(c_vgpu_count), None) + + if (ret == NVML_SUCCESS): + # special case, no supported vGPUs + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + vgpu_type_ids_array = _nvmlVgpuTypeId_t * c_vgpu_count.value + c_vgpu_type_ids = vgpu_type_ids_array() + + # make the call again + ret = fn(handle, byref(c_vgpu_count), c_vgpu_type_ids) + _nvmlCheckReturn(ret) + vgpus = [] + for i in range(c_vgpu_count.value): + vgpus.append(c_vgpu_type_ids[i]) + return vgpus + else: + # error case + raise NVMLError(ret) + +def nvmlVgpuTypeGetGpuInstanceProfileId(vgpuTypeId): + c_profile_id = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetGpuInstanceProfileId") + ret = fn(vgpuTypeId, byref(c_profile_id)) + _nvmlCheckReturn(ret) + return (c_profile_id.value) + +@convertStrBytes +def nvmlVgpuTypeGetClass(vgpuTypeId): + c_class = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) + c_buffer_size = c_uint(NVML_DEVICE_NAME_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetClass") + ret = fn(vgpuTypeId, c_class, byref(c_buffer_size)) + _nvmlCheckReturn(ret) + return c_class.value + +@convertStrBytes +def nvmlVgpuTypeGetName(vgpuTypeId): + c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) + c_buffer_size = c_uint(NVML_DEVICE_NAME_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetName") + ret = fn(vgpuTypeId, c_name, byref(c_buffer_size)) + _nvmlCheckReturn(ret) + return c_name.value + +def nvmlVgpuTypeGetDeviceID(vgpuTypeId): + c_device_id = c_ulonglong(0) + c_subsystem_id = c_ulonglong(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetDeviceID") + ret = fn(vgpuTypeId, byref(c_device_id), byref(c_subsystem_id)) + _nvmlCheckReturn(ret) + return (c_device_id.value, c_subsystem_id.value) + +def nvmlVgpuTypeGetFramebufferSize(vgpuTypeId): + c_fb_size = c_ulonglong(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetFramebufferSize") + ret = fn(vgpuTypeId, byref(c_fb_size)) + _nvmlCheckReturn(ret) + return c_fb_size.value + +def nvmlVgpuTypeGetNumDisplayHeads(vgpuTypeId): + c_num_heads = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetNumDisplayHeads") + ret = fn(vgpuTypeId, byref(c_num_heads)) + _nvmlCheckReturn(ret) + return c_num_heads.value + +def nvmlVgpuTypeGetResolution(vgpuTypeId): + c_xdim = c_uint(0) + c_ydim = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetResolution") + ret = fn(vgpuTypeId, 0, byref(c_xdim), byref(c_ydim)) + _nvmlCheckReturn(ret) + return (c_xdim.value, c_ydim.value) + +@convertStrBytes +def nvmlVgpuTypeGetLicense(vgpuTypeId): + c_license = create_string_buffer(NVML_GRID_LICENSE_BUFFER_SIZE) + c_buffer_size = c_uint(NVML_GRID_LICENSE_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetLicense") + ret = fn(vgpuTypeId, c_license, c_buffer_size) + _nvmlCheckReturn(ret) + return c_license.value + +def nvmlVgpuTypeGetFrameRateLimit(vgpuTypeId): + c_frl_config = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetFrameRateLimit") + ret = fn(vgpuTypeId, byref(c_frl_config)) + _nvmlCheckReturn(ret) + return c_frl_config.value + +def nvmlVgpuTypeGetGspHeapSize(vgpuTypeId): + c_gsp_heap = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetGspHeapSize") + ret = fn(vgpuTypeId, byref(c_gsp_heap)) + _nvmlCheckReturn(ret) + return c_gsp_heap.value + +def nvmlVgpuTypeGetFbReservation(vgpuTypeId): + c_fb_reservation = 
c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetFbReservation") + ret = fn(vgpuTypeId, byref(c_fb_reservation)) + _nvmlCheckReturn(ret) + return c_fb_reservation.value + +def nvmlVgpuInstanceGetRuntimeStateSize(vgpuInstance): + c_runtime_state = nvmlVgpuRuntimeState_v1_t() + c_runtime_state.version = VgpuRuntimeState_v1 + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetRuntimeStateSize") + ret = fn(vgpuInstance, byref(c_runtime_state)) + _nvmlCheckReturn(ret) + return c_runtime_state + +def nvmlVgpuTypeGetMaxInstances(handle, vgpuTypeId): + c_max_instances = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstances") + ret = fn(handle, vgpuTypeId, byref(c_max_instances)) + _nvmlCheckReturn(ret) + return c_max_instances.value + +def nvmlVgpuTypeGetMaxInstancesPerVm(vgpuTypeId): + c_max_instances_per_vm = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstancesPerVm") + ret = fn(vgpuTypeId, byref(c_max_instances_per_vm)) + _nvmlCheckReturn(ret) + return c_max_instances_per_vm.value + +def nvmlVgpuTypeGetBAR1Info(vgpuTypeId): + c_bar1Info = c_nvmlVgpuTypeBar1Info_v1_t(0) + c_bar1Info.version = VgpuTypeBar1Info_v1 + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetBAR1Info") + ret = fn(vgpuTypeId, byref(c_bar1Info)) + _nvmlCheckReturn(ret) + return c_bar1Info + +def nvmlDeviceGetActiveVgpus(handle): + # first call to get the size + c_vgpu_count = c_uint(0) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetActiveVgpus") + ret = fn(handle, byref(c_vgpu_count), None) + + if (ret == NVML_SUCCESS): + # special case, no active vGPUs + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + vgpu_instance_array = _nvmlVgpuInstance_t * c_vgpu_count.value + c_vgpu_instances = vgpu_instance_array() + + # make the call again + ret = fn(handle, byref(c_vgpu_count), c_vgpu_instances) + _nvmlCheckReturn(ret) + vgpus = [] + for i in range(c_vgpu_count.value): + vgpus.append(c_vgpu_instances[i]) + return vgpus + else: + # error case + raise NVMLError(ret) + +@convertStrBytes +def nvmlVgpuInstanceGetVmID(vgpuInstance): + c_vm_id = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) + c_buffer_size = c_uint(NVML_GRID_LICENSE_BUFFER_SIZE) + c_vm_id_type = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetVmID") + ret = fn(vgpuInstance, byref(c_vm_id), c_buffer_size, byref(c_vm_id_type)) + _nvmlCheckReturn(ret) + return (c_vm_id.value, c_vm_id_type.value) + +@convertStrBytes +def nvmlVgpuInstanceGetUUID(vgpuInstance): + c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) + c_buffer_size = c_uint(NVML_DEVICE_UUID_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetUUID") + ret = fn(vgpuInstance, byref(c_uuid), c_buffer_size) + _nvmlCheckReturn(ret) + return c_uuid.value + +@convertStrBytes +def nvmlVgpuInstanceGetMdevUUID(vgpuInstance): + c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) + c_buffer_size = c_uint(NVML_DEVICE_UUID_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetMdevUUID") + ret = fn(vgpuInstance, byref(c_uuid), c_buffer_size) + _nvmlCheckReturn(ret) + return c_uuid.value + +@convertStrBytes +def nvmlVgpuInstanceGetVmDriverVersion(vgpuInstance): + c_driver_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) + c_buffer_size = c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetVmDriverVersion") + ret = fn(vgpuInstance, byref(c_driver_version), c_buffer_size) + _nvmlCheckReturn(ret) + return c_driver_version.value + +def 
nvmlVgpuInstanceGetLicenseStatus(vgpuInstance): + c_license_status = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetLicenseStatus") + ret = fn(vgpuInstance, byref(c_license_status)) + _nvmlCheckReturn(ret) + return c_license_status.value + +def nvmlVgpuInstanceGetLicenseInfo_v2(vgpuInstance): + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetLicenseInfo_v2") + c_license_info = c_nvmlVgpuLicenseInfo_t() + ret = fn(vgpuInstance, byref(c_license_info)) + _nvmlCheckReturn(ret) + return c_license_info + +def nvmlVgpuInstanceGetLicenseInfo(vgpuInstance): + return nvmlVgpuInstanceGetLicenseInfo_v2(vgpuInstance) + +def nvmlVgpuInstanceGetFrameRateLimit(vgpuInstance): + c_frl = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFrameRateLimit") + ret = fn(vgpuInstance, byref(c_frl)) + _nvmlCheckReturn(ret) + return c_frl.value + +def nvmlVgpuInstanceGetEccMode(vgpuInstance): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEccMode") + ret = fn(vgpuInstance, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlVgpuInstanceGetType(vgpuInstance): + c_vgpu_type = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetType") + ret = fn(vgpuInstance, byref(c_vgpu_type)) + _nvmlCheckReturn(ret) + return c_vgpu_type.value + +def nvmlVgpuInstanceGetEncoderCapacity(vgpuInstance): + c_encoder_capacity = c_ulonglong(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderCapacity") + ret = fn(vgpuInstance, byref(c_encoder_capacity)) + _nvmlCheckReturn(ret) + return c_encoder_capacity.value + +def nvmlVgpuInstanceSetEncoderCapacity(vgpuInstance, encoder_capacity): + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceSetEncoderCapacity") + return fn(vgpuInstance, encoder_capacity) + +def nvmlVgpuInstanceGetFbUsage(vgpuInstance): + c_fb_usage = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFbUsage") + ret = fn(vgpuInstance, byref(c_fb_usage)) + _nvmlCheckReturn(ret) + return c_fb_usage.value + +def nvmlVgpuTypeGetCapabilities(vgpuTypeId, capability): + c_cap_result = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetCapabilities") + ret = fn(vgpuTypeId, _nvmlVgpuCapability_t(capability), byref(c_cap_result)) + _nvmlCheckReturn(ret) + return (c_cap_result.value) + +def nvmlVgpuInstanceGetGpuInstanceId(vgpuInstance): + c_id = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetGpuInstanceId") + ret = fn(vgpuInstance, byref(c_id)) + _nvmlCheckReturn(ret) + return (c_id.value) + +@convertStrBytes +def nvmlVgpuInstanceGetGpuPciId(vgpuInstance): + c_vgpuPciId = create_string_buffer(NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetGpuPciId") + ret = fn(vgpuInstance, c_vgpuPciId, byref(c_uint(NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE))) + _nvmlCheckReturn(ret) + return c_vgpuPciId.value + +def nvmlDeviceGetVgpuUtilization(handle, timeStamp): + # first call to get the size + c_vgpu_count = c_uint(0) + c_time_stamp = c_ulonglong(timeStamp) + c_sample_value_type = _nvmlValueType_t() + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuUtilization") + ret = fn(handle, c_time_stamp, byref(c_sample_value_type), byref(c_vgpu_count), None) + + if (ret == NVML_SUCCESS): + # special case, no active vGPUs + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + sampleArray = c_vgpu_count.value * c_nvmlVgpuInstanceUtilizationSample_t + c_samples = sampleArray() + + # make the call again + ret = fn(handle, c_time_stamp, byref(c_sample_value_type), byref(c_vgpu_count), 
c_samples) + _nvmlCheckReturn(ret) + + return c_samples[0:c_vgpu_count.value] + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetVgpuInstancesUtilizationInfo(handle, timeStamp): + # first call to get the size + c_time_stamp = c_ulonglong(timeStamp) + c_vgpuUtilInfo = c_nvmlVgpuInstancesUtilizationInfo_v1_t(0) + c_vgpuUtilInfo.version = VgpuInstancesUtilizationInfo_v1 + c_vgpuUtilInfo.sampleValType = _nvmlValueType_t() + c_vgpuUtilInfo.vgpuInstanceCount = c_uint(0) + c_vgpuUtilInfo.lastSeenTimeStamp = c_time_stamp + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuInstancesUtilizationInfo") + ret = fn(handle, byref(c_vgpuUtilInfo)) + + if (ret == NVML_SUCCESS): + # special case, no active vGPUs + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + sampleArray = c_vgpuUtilInfo.vgpuInstanceCount * c_nvmlVgpuInstanceUtilizationInfo_v1_t + c_samples = sampleArray() + c_vgpuUtilInfo.vgpuUtilArray = c_samples + + # make the call again + ret = fn(handle, byref(c_vgpuUtilInfo)) + _nvmlCheckReturn(ret) + + return c_samples[0:c_vgpuUtilInfo.vgpuInstanceCount] + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetP2PStatus(device1, device2, p2pIndex): + c_p2pstatus = _nvmlGpuP2PStatus_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetP2PStatus") + ret = fn(device1, device2,p2pIndex, byref(c_p2pstatus)) + _nvmlCheckReturn(ret) + return c_p2pstatus.value + +def nvmlDeviceGetGridLicensableFeatures_v4(handle): + c_get_grid_licensable_features = c_nvmlGridLicensableFeatures_v4_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGridLicensableFeatures_v4") + ret = fn(handle, byref(c_get_grid_licensable_features)) + _nvmlCheckReturn(ret) + + return (c_get_grid_licensable_features) + +def nvmlDeviceGetGridLicensableFeatures(handle): + return nvmlDeviceGetGridLicensableFeatures_v4(handle) + +def nvmlDeviceGetGspFirmwareVersion(handle, version=None): + isUserDefined = version is not None + if not isUserDefined: + version = (c_char * NVML_GSP_FIRMWARE_VERSION_BUF_SIZE)() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGspFirmwareVersion") + ret = fn(handle, version) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isUserDefined else version.value + +def nvmlDeviceGetGspFirmwareMode(handle, isEnabled=c_uint(), defaultMode=c_uint()): + isReference = type(isEnabled) is not c_uint + isEnabledRef = isEnabled if isReference else byref(isEnabled) + defaultModeRef = defaultMode if isReference else byref(defaultMode) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGspFirmwareMode") + ret = fn(handle, isEnabledRef, defaultModeRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else [isEnabled.value, defaultMode.value] + +def nvmlDeviceGetEncoderCapacity(handle, encoderQueryType): + c_encoder_capacity = c_ulonglong(0) + c_encoderQuery_type = _nvmlEncoderQueryType_t(encoderQueryType) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEncoderCapacity") + ret = fn(handle, c_encoderQuery_type, byref(c_encoder_capacity)) + _nvmlCheckReturn(ret) + return c_encoder_capacity.value + +def nvmlDeviceGetVgpuProcessUtilization(handle, timeStamp): + # first call to get the size + c_vgpu_count = c_uint(0) + c_time_stamp = c_ulonglong(timeStamp) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuProcessUtilization") + ret = fn(handle, c_time_stamp, byref(c_vgpu_count), None) + + if (ret == NVML_SUCCESS): + # special case, no active vGPUs + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + sampleArray = c_vgpu_count.value * c_nvmlVgpuProcessUtilizationSample_t + 
c_samples = sampleArray() + + # make the call again + ret = fn(handle, c_time_stamp, byref(c_vgpu_count), c_samples) + _nvmlCheckReturn(ret) + + return c_samples[0:c_vgpu_count.value] + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetVgpuProcessesUtilizationInfo(handle, timeStamp): + # first call to get the size + c_time_stamp = c_ulonglong(timeStamp) + c_vgpuProcUtilInfo = c_nvmlVgpuProcessesUtilizationInfo_v1_t(0) + c_vgpuProcUtilInfo.version = VgpuProcessesUtilizationInfo_v1 + c_vgpuProcUtilInfo.vgpuProcessCount = c_uint(0) + c_vgpuProcUtilInfo.lastSeenTimeStamp = c_time_stamp + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuProcessesUtilizationInfo") + ret = fn(handle, byref(c_vgpuProcUtilInfo)) + + if (ret == NVML_SUCCESS): + # special case, no active vGPUs + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + sampleArray = c_vgpuProcUtilInfo.vgpuProcessCount * c_nvmlVgpuProcessUtilizationInfo_v1_t + c_samples = sampleArray() + c_vgpuProcUtilInfo.vgpuProcUtilArray = c_samples + + # make the call again + ret = fn(handle, byref(c_vgpuProcUtilInfo)) + _nvmlCheckReturn(ret) + + return c_samples[0:c_vgpuProcUtilInfo.vgpuProcessCount] + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetEncoderStats(handle): + c_encoderCount = c_ulonglong(0) + c_encodeFps = c_ulonglong(0) + c_encoderLatency = c_ulonglong(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEncoderStats") + ret = fn(handle, byref(c_encoderCount), byref(c_encodeFps), byref(c_encoderLatency)) + _nvmlCheckReturn(ret) + return (c_encoderCount.value, c_encodeFps.value, c_encoderLatency.value) + +def nvmlDeviceGetEncoderSessions(handle): + # first call to get the size + c_session_count = c_uint(0) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEncoderSessions") + ret = fn(handle, byref(c_session_count), None) + + if (ret == NVML_SUCCESS): + if (c_session_count.value != 0): + # typical case + session_array = c_nvmlEncoderSession_t * c_session_count.value + c_sessions = session_array() + + # make the call again + ret = fn(handle, byref(c_session_count), c_sessions) + _nvmlCheckReturn(ret) + sessions = [] + for i in range(c_session_count.value): + sessions.append(c_sessions[i]) + return sessions + else: + return [] # no active sessions + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetFBCStats(handle): + c_fbcStats = c_nvmlFBCStats_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFBCStats") + ret = fn(handle, byref(c_fbcStats)) + _nvmlCheckReturn(ret) + return c_fbcStats + +def nvmlDeviceGetFBCSessions(handle): + # first call to get the size + c_session_count = c_uint(0) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFBCSessions") + ret = fn(handle, byref(c_session_count), None) + + if (ret == NVML_SUCCESS): + if (c_session_count.value != 0): + # typical case + session_array = c_nvmlFBCSession_t * c_session_count.value + c_sessions = session_array() + + # make the call again + ret = fn(handle, byref(c_session_count), c_sessions) + _nvmlCheckReturn(ret) + sessions = [] + for i in range(c_session_count.value): + sessions.append(c_sessions[i]) + return sessions + else: + return [] # no active sessions + else: + # error case + raise NVMLError(ret) + +def nvmlVgpuInstanceGetEncoderStats(vgpuInstance): + c_encoderCount = c_ulonglong(0) + c_encodeFps = c_ulonglong(0) + c_encoderLatency = c_ulonglong(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderStats") + ret = fn(vgpuInstance, byref(c_encoderCount), byref(c_encodeFps), byref(c_encoderLatency)) + 
_nvmlCheckReturn(ret) + return (c_encoderCount.value, c_encodeFps.value, c_encoderLatency.value) + +def nvmlVgpuInstanceGetEncoderSessions(vgpuInstance): + # first call to get the size + c_session_count = c_uint(0) + + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderSessions") + ret = fn(vgpuInstance, byref(c_session_count), None) + + if (ret == NVML_SUCCESS): + if (c_session_count.value != 0): + # typical case + session_array = c_nvmlEncoderSession_t * c_session_count.value + c_sessions = session_array() + + # make the call again + ret = fn(vgpuInstance, byref(c_session_count), c_sessions) + _nvmlCheckReturn(ret) + sessions = [] + for i in range(c_session_count.value): + sessions.append(c_sessions[i]) + return sessions + else: + return [] # no active sessions + else: + # error case + raise NVMLError(ret) + +def nvmlVgpuInstanceGetFBCStats(vgpuInstance): + c_fbcStats = c_nvmlFBCStats_t() + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFBCStats") + ret = fn(vgpuInstance, byref(c_fbcStats)) + _nvmlCheckReturn(ret) + return c_fbcStats + +def nvmlVgpuInstanceGetFBCSessions(vgpuInstance): + # first call to get the size + c_session_count = c_uint(0) + + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFBCSessions") + ret = fn(vgpuInstance, byref(c_session_count), None) + + if (ret == NVML_SUCCESS): + if (c_session_count.value != 0): + # typical case + session_array = c_nvmlFBCSession_t * c_session_count.value + c_sessions = session_array() + + # make the call again + ret = fn(vgpuInstance, byref(c_session_count), c_sessions) + _nvmlCheckReturn(ret) + sessions = [] + for i in range(c_session_count.value): + sessions.append(c_sessions[i]) + return sessions + else: + return [] # no active sessions + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetProcessUtilization(handle, timeStamp): + # first call to get the size + c_count = c_uint(0) + c_time_stamp = c_ulonglong(timeStamp) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetProcessUtilization") + ret = fn(handle, None, byref(c_count), c_time_stamp) + + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + sampleArray = c_count.value * c_nvmlProcessUtilizationSample_t + c_samples = sampleArray() + + # make the call again + ret = fn(handle, c_samples, byref(c_count), c_time_stamp) + _nvmlCheckReturn(ret) + + return c_samples[0:c_count.value] + else: + # error case + raise NVMLError(ret) + +def nvmlDeviceGetProcessesUtilizationInfo(handle, timeStamp): + # first call to get the size + c_time_stamp = c_ulonglong(timeStamp) + c_processesUtilInfo = c_nvmlProcessesUtilizationInfo_v1_t(0) + c_processesUtilInfo.version = ProcessesUtilizationInfo_v1 + c_processesUtilInfo.processSamplesCount = c_uint(0) + c_processesUtilInfo.lastSeenTimeStamp = c_time_stamp + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetProcessesUtilizationInfo") + ret = fn(handle, byref(c_processesUtilInfo)) + + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + sampleArray = c_processesUtilInfo.processSamplesCount * c_nvmlProcessUtilizationInfo_v1_t + c_samples = sampleArray() + c_processesUtilInfo.procUtilArray = c_samples + + # make the call again + ret = fn(handle, byref(c_processesUtilInfo)) + _nvmlCheckReturn(ret) + + return c_samples[0:c_processesUtilInfo.processSamplesCount] + else: + # error case + raise NVMLError(ret) + +def nvmlVgpuInstanceGetMetadata(vgpuInstance): + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetMetadata") + c_vgpuMetadata = c_nvmlVgpuMetadata_t() + c_bufferSize = c_uint(0) + # Make the first NVML API call to get the 
c_bufferSize value. + # We have already allocated required buffer above. + ret = fn(vgpuInstance, byref(c_vgpuMetadata), byref(c_bufferSize)) + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + ret = fn(vgpuInstance, byref(c_vgpuMetadata), byref(c_bufferSize)) + _nvmlCheckReturn(ret) + else: + raise NVMLError(ret) + return c_vgpuMetadata + +def nvmlDeviceGetVgpuMetadata(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuMetadata") + c_vgpuPgpuMetadata = c_nvmlVgpuPgpuMetadata_t() + c_bufferSize = c_uint(0) + # Make the first NVML API call to get the c_bufferSize value. + # We have already allocated required buffer above. + ret = fn(handle, byref(c_vgpuPgpuMetadata), byref(c_bufferSize)) + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + ret = fn(handle, byref(c_vgpuPgpuMetadata), byref(c_bufferSize)) + _nvmlCheckReturn(ret) + else: + raise NVMLError(ret) + return c_vgpuPgpuMetadata + +def nvmlGetVgpuCompatibility(vgpuMetadata, pgpuMetadata): + fn = _nvmlGetFunctionPointer("nvmlGetVgpuCompatibility") + c_vgpuPgpuCompatibility = c_nvmlVgpuPgpuCompatibility_t() + ret = fn(byref(vgpuMetadata), byref(pgpuMetadata), byref(c_vgpuPgpuCompatibility)) + _nvmlCheckReturn(ret) + return c_vgpuPgpuCompatibility + +@convertStrBytes +def nvmlDeviceGetPgpuMetadataString(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPgpuMetadataString") + c_pgpuMetadata = create_string_buffer(NVML_VGPU_PGPU_METADATA_OPAQUE_DATA_SIZE) + c_bufferSize = c_uint(0) + # Make the first NVML API call to get the c_bufferSize value. + # We have already allocated required buffer above. + ret = fn(handle, byref(c_pgpuMetadata), byref(c_bufferSize)) + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + ret = fn(handle, byref(c_pgpuMetadata), byref(c_bufferSize)) + _nvmlCheckReturn(ret) + else: + raise NVMLError(ret) + return (c_pgpuMetadata.value, c_bufferSize.value) + +def nvmlDeviceGetVgpuSchedulerLog(handle): + c_vgpu_sched_log = c_nvmlVgpuSchedulerLog_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerLog") + ret = fn(handle, byref(c_vgpu_sched_log)) + _nvmlCheckReturn(ret) + return c_vgpu_sched_log + +def nvmlDeviceGetVgpuSchedulerState(handle): + c_vgpu_sched_state = c_nvmlVgpuSchedulerGetState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerState") + ret = fn(handle, byref(c_vgpu_sched_state)) + _nvmlCheckReturn(ret) + return c_vgpu_sched_state + +def nvmlDeviceGetVgpuSchedulerCapabilities(handle): + c_vgpu_sched_caps = c_nvmlVgpuSchedulerCapabilities_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerCapabilities") + ret = fn(handle, byref(c_vgpu_sched_caps)) + _nvmlCheckReturn(ret) + return c_vgpu_sched_caps + +def nvmlDeviceSetVgpuSchedulerState(handle, sched_state): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetVgpuSchedulerState") + ret = fn(handle, byref(sched_state)) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlSetVgpuVersion(vgpuVersion): + fn = _nvmlGetFunctionPointer("nvmlSetVgpuVersion") + ret = fn(byref(vgpuVersion)) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlGetVgpuVersion(supported=None, current=None): + isUserDefined = (supported is not None) or (current is not None) + if not isUserDefined: + supported = c_nvmlVgpuVersion_t() + current = c_nvmlVgpuVersion_t() + fn = _nvmlGetFunctionPointer("nvmlGetVgpuVersion") + ret = fn(byref(supported), byref(current)) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isUserDefined else [(supported.minVersion, + supported.maxVersion), + (current.minVersion, + current.maxVersion)] + +def 
nvmlVgpuInstanceGetAccountingMode(vgpuInstance): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetAccountingMode") + ret = fn(vgpuInstance, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlVgpuInstanceGetAccountingPids(vgpuInstance): + c_pidCount = c_uint() + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetAccountingPids") + ret = fn(vgpuInstance, byref(c_pidCount), None) + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + sampleArray = c_pidCount.value * c_uint + c_pidArray = sampleArray() + ret = fn(vgpuInstance, byref(c_pidCount), byref(c_pidArray)) + _nvmlCheckReturn(ret) + else: + raise NVMLError(ret) + return (c_pidCount, c_pidArray) + +def nvmlVgpuInstanceGetAccountingStats(vgpuInstance, pid): + c_accountingStats = c_nvmlAccountingStats_t() + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetAccountingStats") + ret = fn(vgpuInstance, pid, byref(c_accountingStats)) + _nvmlCheckReturn(ret) + return c_accountingStats + +def nvmlVgpuInstanceClearAccountingPids(vgpuInstance): + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceClearAccountingPids") + ret = fn(vgpuInstance) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlGetExcludedDeviceCount(): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlGetExcludedDeviceCount") + ret = fn(byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlGetExcludedDeviceInfoByIndex(index): + c_index = c_uint(index) + info = c_nvmlExcludedDeviceInfo_t() + fn = _nvmlGetFunctionPointer("nvmlGetExcludedDeviceInfoByIndex") + ret = fn(c_index, byref(info)) + _nvmlCheckReturn(ret) + return info + +def nvmlDeviceGetHostVgpuMode(handle): + c_host_vgpu_mode = _nvmlHostVgpuMode_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHostVgpuMode") + ret = fn(handle, byref(c_host_vgpu_mode)) + _nvmlCheckReturn(ret) + return c_host_vgpu_mode.value + +def nvmlDeviceSetMigMode(device, mode): + c_activationStatus = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceSetMigMode") + ret = fn(device, mode, byref(c_activationStatus)) + _nvmlCheckReturn(ret) + return c_activationStatus.value + +def nvmlDeviceGetMigMode(device): + c_currentMode = c_uint() + c_pendingMode = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMigMode") + ret = fn(device, byref(c_currentMode), byref(c_pendingMode)) + _nvmlCheckReturn(ret) + return [c_currentMode.value, c_pendingMode.value] + +def nvmlDeviceGetGpuInstanceProfileInfo(device, profile, version=2): + if version == 2: + c_info = c_nvmlGpuInstanceProfileInfo_v2_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceProfileInfoV") + elif version == 1: + c_info = c_nvmlGpuInstanceProfileInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceProfileInfo") + else: + raise NVMLError(NVML_ERROR_FUNCTION_NOT_FOUND) + ret = fn(device, profile, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +# Define function alias for the API exposed by NVML +nvmlDeviceGetGpuInstanceProfileInfoV = nvmlDeviceGetGpuInstanceProfileInfo + +def nvmlDeviceGetGpuInstanceRemainingCapacity(device, profileId): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceRemainingCapacity") + ret = fn(device, profileId, byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetGpuInstancePossiblePlacements(device, profileId, placementsRef, countRef): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstancePossiblePlacements_v2") + ret = fn(device, profileId, placementsRef, countRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def 
nvmlDeviceCreateGpuInstance(device, profileId): + c_instance = c_nvmlGpuInstance_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceCreateGpuInstance") + ret = fn(device, profileId, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + +def nvmlDeviceCreateGpuInstanceWithPlacement(device, profileId, placement): + c_instance = c_nvmlGpuInstance_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceCreateGpuInstanceWithPlacement") + ret = fn(device, profileId, placement, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + +def nvmlGpuInstanceDestroy(gpuInstance): + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceDestroy") + ret = fn(gpuInstance) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetGpuInstances(device, profileId, gpuInstancesRef, countRef): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstances") + ret = fn(device, profileId, gpuInstancesRef, countRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetGpuInstanceById(device, gpuInstanceId): + c_instance = c_nvmlGpuInstance_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceById") + ret = fn(device, gpuInstanceId, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + +def nvmlGpuInstanceGetInfo(gpuInstance): + c_info = c_nvmlGpuInstanceInfo_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetInfo") + ret = fn(gpuInstance, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlGpuInstanceGetComputeInstanceProfileInfo(device, profile, engProfile, version=2): + if version == 2: + c_info = c_nvmlComputeInstanceProfileInfo_v2_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceProfileInfoV") + elif version == 1: + c_info = c_nvmlComputeInstanceProfileInfo_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceProfileInfo") + else: + raise NVMLError(NVML_ERROR_FUNCTION_NOT_FOUND) + ret = fn(device, profile, engProfile, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +# Define function alias for the API exposed by NVML +nvmlGpuInstanceGetComputeInstanceProfileInfoV = nvmlGpuInstanceGetComputeInstanceProfileInfo + +def nvmlGpuInstanceGetComputeInstanceRemainingCapacity(gpuInstance, profileId): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceRemainingCapacity") + ret = fn(gpuInstance, profileId, byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlGpuInstanceGetComputeInstancePossiblePlacements(gpuInstance, profileId, placementsRef, countRef): + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstancePossiblePlacements") + ret = fn(gpuInstance, profileId, placementsRef, countRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlGpuInstanceCreateComputeInstance(gpuInstance, profileId): + c_instance = c_nvmlComputeInstance_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceCreateComputeInstance") + ret = fn(gpuInstance, profileId, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + +def nvmlGpuInstanceCreateComputeInstanceWithPlacement(gpuInstance, profileId, placement): + c_instance = c_nvmlComputeInstance_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceCreateComputeInstanceWithPlacement") + ret = fn(gpuInstance, profileId, placement, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + +def nvmlComputeInstanceDestroy(computeInstance): + fn = _nvmlGetFunctionPointer("nvmlComputeInstanceDestroy") + ret = fn(computeInstance) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlGpuInstanceGetComputeInstances(gpuInstance, 
profileId, computeInstancesRef, countRef): + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstances") + ret = fn(gpuInstance, profileId, computeInstancesRef, countRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlGpuInstanceGetComputeInstanceById(gpuInstance, computeInstanceId): + c_instance = c_nvmlComputeInstance_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceById") + ret = fn(gpuInstance, computeInstanceId, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + +def nvmlComputeInstanceGetInfo_v2(computeInstance): + c_info = c_nvmlComputeInstanceInfo_t() + fn = _nvmlGetFunctionPointer("nvmlComputeInstanceGetInfo_v2") + ret = fn(computeInstance, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlComputeInstanceGetInfo(computeInstance): + return nvmlComputeInstanceGetInfo_v2(computeInstance) + +def nvmlDeviceIsMigDeviceHandle(device): + c_isMigDevice = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceIsMigDeviceHandle") + ret = fn(device, byref(c_isMigDevice)) + _nvmlCheckReturn(ret) + return c_isMigDevice + +def nvmlDeviceGetGpuInstanceId(device): + c_gpuInstanceId = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceId") + ret = fn(device, byref(c_gpuInstanceId)) + _nvmlCheckReturn(ret) + return c_gpuInstanceId.value + +def nvmlDeviceGetComputeInstanceId(device): + c_computeInstanceId = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeInstanceId") + ret = fn(device, byref(c_computeInstanceId)) + _nvmlCheckReturn(ret) + return c_computeInstanceId.value + +def nvmlDeviceGetMaxMigDeviceCount(device): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxMigDeviceCount") + ret = fn(device, byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetMigDeviceHandleByIndex(device, index): + c_index = c_uint(index) + migDevice = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMigDeviceHandleByIndex") + ret = fn(device, c_index, byref(migDevice)) + _nvmlCheckReturn(ret) + return migDevice + +def nvmlDeviceGetDeviceHandleFromMigDeviceHandle(migDevice): + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDeviceHandleFromMigDeviceHandle") + ret = fn(migDevice, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetAttributes_v2(device): + c_attrs = c_nvmlDeviceAttributes() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAttributes_v2") + ret = fn(device, byref(c_attrs)) + _nvmlCheckReturn(ret) + return c_attrs + +def nvmlDeviceGetAttributes(device): + return nvmlDeviceGetAttributes_v2(device) + +def nvmlDeviceGetRemappedRows(device): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetRemappedRows") + c_corr = c_uint() + c_unc = c_uint() + c_bpending = c_uint() + c_bfailure = c_uint() + ret = fn(device, byref(c_corr), byref(c_unc), byref(c_bpending), byref(c_bfailure)) + _nvmlCheckReturn(ret) + return (c_corr.value, c_unc.value, c_bpending.value, c_bfailure.value) + +def nvmlDeviceGetRowRemapperHistogram(device): + c_vals = c_nvmlRowRemapperHistogramValues() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetRowRemapperHistogram") + ret = fn(device, byref(c_vals)) + _nvmlCheckReturn(ret) + return c_vals + +def nvmlDeviceGetArchitecture(device): + arch = _nvmlDeviceArchitecture_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetArchitecture") + ret = fn(device, byref(arch)) + _nvmlCheckReturn(ret) + return arch.value + +def nvmlDeviceGetBusType(device): + c_busType = _nvmlBusType_t() + fn = 
_nvmlGetFunctionPointer("nvmlDeviceGetBusType") + ret = fn(device, byref(c_busType)) + _nvmlCheckReturn(ret) + return c_busType.value + +def nvmlDeviceGetIrqNum(device): + c_irqNum = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetIrqNum") + ret = fn(device, byref(c_irqNum)) + _nvmlCheckReturn(ret) + return c_irqNum.value + +def nvmlDeviceGetNumGpuCores(device): + c_numCores = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNumGpuCores") + ret = fn(device, byref(c_numCores)) + _nvmlCheckReturn(ret) + return c_numCores.value + +def nvmlDeviceGetPowerSource(device): + c_powerSource = _nvmlPowerSource_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerSource") + ret = fn(device, byref(c_powerSource)) + _nvmlCheckReturn(ret) + return c_powerSource.value + +def nvmlDeviceGetMemoryBusWidth(device): + c_memBusWidth = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryBusWidth") + ret = fn(device, byref(c_memBusWidth)) + _nvmlCheckReturn(ret) + return c_memBusWidth.value + +def nvmlDeviceGetPcieLinkMaxSpeed(device): + c_speed = _nvmlPcieLinkMaxSpeed_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieLinkMaxSpeed") + ret = fn(device, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +def nvmlDeviceGetAdaptiveClockInfoStatus(device): + c_adaptiveClockInfoStatus = _nvmlAdaptiveClockInfoStatus_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAdaptiveClockInfoStatus") + ret = fn(device, byref(c_adaptiveClockInfoStatus)) + _nvmlCheckReturn(ret) + return c_adaptiveClockInfoStatus.value + +def nvmlDeviceGetPcieSpeed(device): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieSpeed") + ret = fn(device, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +def nvmlDeviceGetDynamicPstatesInfo(device, c_dynamicpstatesinfo=c_nvmlGpuDynamicPstatesInfo_t()): + isReference = type(c_dynamicpstatesinfo) is not c_nvmlGpuDynamicPstatesInfo_t + dynamicpstatesinfoRef = c_dynamicpstatesinfo if isReference else byref(c_dynamicpstatesinfo) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDynamicPstatesInfo"); + ret = fn(device, dynamicpstatesinfoRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else c_dynamicpstatesinfo + +def nvmlDeviceSetFanSpeed_v2(handle, index, speed): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetFanSpeed_v2"); + ret = fn(handle, index, speed) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetThermalSettings(device, sensorindex, c_thermalsettings=c_nvmlGpuThermalSettings_t()): + isReference = type(c_thermalsettings) is not c_nvmlGpuThermalSettings_t + thermalsettingsRef = c_thermalsettings if isReference else byref(c_thermalsettings) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetThermalSettings"); + ret = fn(device, sensorindex, thermalsettingsRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else c_thermalsettings.sensor[:] + +def nvmlDeviceGetMinMaxClockOfPState(device, clockType, pstate, minClockMHz=c_uint(), maxClockMHz=c_uint()): + isReference = (type(minClockMHz) is not c_uint) or (type(maxClockMHz) is not c_uint) + minClockMHzRef = minClockMHz if isReference else byref(minClockMHz) + maxClockMHzRef = maxClockMHz if isReference else byref(maxClockMHz) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinMaxClockOfPState"); + ret = fn(device, _nvmlClockType_t(clockType), _nvmlClockType_t(pstate), minClockMHzRef, maxClockMHzRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else (minClockMHz.value, maxClockMHz.value) + +class c_nvmlClockOffset_t(_PrintableStructure): + 
_fields_ = [ + ('version', c_uint), + ('type', _nvmlClockType_t), + ('pstate', _nvmlPstates_t), + ('clockOffsetMHz', c_int), + ('minClockOffsetMHz', c_int), + ('maxClockOffsetMHz', c_int), + ] + +nvmlClockOffset_v1 = 0x1000018 + +def nvmlDeviceGetClockOffsets(device, info): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClockOffsets"); + ret = fn(device, info) + return NVML_SUCCESS + +def nvmlDeviceSetClockOffsets(device, info): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetClockOffsets"); + ret = fn(device, info) + return NVML_SUCCESS + +def nvmlDeviceGetSupportedPerformanceStates(device): + pstates = [] + c_count = c_uint(NVML_MAX_GPU_PERF_PSTATES) + c_size = sizeof(c_uint)*c_count.value + + # NOTE: use 'c_uint' to represent the size of the nvmlPstate_t enumeration. + pstates_array = _nvmlPstates_t * c_count.value + c_pstates = pstates_array() + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedPerformanceStates") + ret = fn(device, c_pstates, c_size) + _nvmlCheckReturn(ret) + + for value in c_pstates: + if value != NVML_PSTATE_UNKNOWN: + pstates.append(value) + + return pstates + +def nvmlDeviceGetGpcClkVfOffset(device): + offset = c_int32() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpcClkVfOffset") + ret = fn(device, byref(offset)) + _nvmlCheckReturn(ret) + return offset.value + +def nvmlDeviceSetGpcClkVfOffset(device, offset): + c_offset = c_int32(offset) + fn = _nvmlGetFunctionPointer("nvmlDeviceSetGpcClkVfOffset") + ret = fn(device, c_offset) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetGpcClkMinMaxVfOffset(device, minOffset=c_int(), maxOffset=c_int()): + isReference = (type(minOffset) is not c_int) or (type(maxOffset) is not c_int) + minOffsetRef = minOffset if isReference else byref(minOffset) + maxOffsetRef = maxOffset if isReference else byref(maxOffset) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpcClkMinMaxVfOffset") + ret = fn(device, minOffsetRef, maxOffsetRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else (minOffset.value, maxOffset.value) + +def nvmlDeviceGetMemClkVfOffset(device): + offset = c_int32() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemClkVfOffset") + ret = fn(device, byref(offset)) + _nvmlCheckReturn(ret) + return offset.value + +def nvmlDeviceSetMemClkVfOffset(device, offset): + c_offset = c_int32(offset) + fn = _nvmlGetFunctionPointer("nvmlDeviceSetMemClkVfOffset") + ret = fn(device, c_offset) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetMemClkMinMaxVfOffset(device, minOffset=c_int(), maxOffset=c_int()): + isReference = (type(minOffset) is not c_int) or (type(maxOffset) is not c_int) + minOffsetRef = minOffset if isReference else byref(minOffset) + maxOffsetRef = maxOffset if isReference else byref(maxOffset) + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemClkMinMaxVfOffset") + ret = fn(device, minOffsetRef, maxOffsetRef) + _nvmlCheckReturn(ret) + return NVML_SUCCESS if isReference else (minOffset.value, maxOffset.value) + +def nvmlSystemSetConfComputeGpusReadyState(state): + c_state = c_uint(state) + fn = _nvmlGetFunctionPointer("nvmlSystemSetConfComputeGpusReadyState") + ret = fn(c_state) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlSystemGetConfComputeGpusReadyState(): + c_state = c_uint() + fn = _nvmlGetFunctionPointer("nvmlSystemGetConfComputeGpusReadyState") + ret = fn(byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + +def nvmlSystemGetConfComputeCapabilities(): + c_ccSysCaps = c_nvmlConfComputeSystemCaps_t() + fn = 
_nvmlGetFunctionPointer("nvmlSystemGetConfComputeCapabilities") + ret = fn(byref(c_ccSysCaps)) + _nvmlCheckReturn(ret) + return c_ccSysCaps + +def nvmlSystemGetConfComputeState(): + c_state = c_nvmlConfComputeSystemState_t() + fn = _nvmlGetFunctionPointer("nvmlSystemGetConfComputeState") + ret = fn(byref(c_state)) + _nvmlCheckReturn(ret) + return c_state + +def nvmlSystemGetConfComputeSettings(settings): + fn = _nvmlGetFunctionPointer("nvmlSystemGetConfComputeSettings") + return fn(settings) + +def nvmlDeviceSetConfComputeUnprotectedMemSize(device, c_ccMemSize): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetConfComputeUnprotectedMemSize") + ret = fn(device, c_ccMemSize) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetConfComputeMemSizeInfo(device): + c_ccMemSize = c_nvmlConfComputeMemSizeInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeMemSizeInfo") + ret = fn(device, byref(c_ccMemSize)) + _nvmlCheckReturn(ret) + return c_ccMemSize + +def nvmlDeviceGetConfComputeProtectedMemoryUsage(device): + c_memory = c_nvmlMemory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeProtectedMemoryUsage") + ret = fn(device, byref(c_memory)) + _nvmlCheckReturn(ret) + return c_memory + +def nvmlDeviceGetConfComputeGpuCertificate(device): + c_cert = c_nvmlConfComputeGpuCertificate_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeGpuCertificate") + ret = fn(device, byref(c_cert)) + _nvmlCheckReturn(ret) + return c_cert + +def nvmlDeviceGetConfComputeGpuAttestationReport(device, c_nonce): + c_attestReport = c_nvmlConfComputeGpuAttestationReport_t() + c_nonce_arr = (c_uint8 * len(c_nonce))(*(c_nonce)) + setattr(c_attestReport, 'nonce', c_nonce_arr) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeGpuAttestationReport") + ret = fn(device, byref(c_attestReport)) + _nvmlCheckReturn(ret) + return c_attestReport + +def nvmlSystemSetConfComputeKeyRotationThresholdInfo(max_atk_adv): + c_keyRotationThrInfo = c_nvmlConfComputeSetKeyRotationThresholdInfo_t(0) + c_keyRotationThrInfo.version = ConfComputeSetKeyRotationThresholdInfo_v1 + c_keyRotationThrInfo.maxAttackerAdvantage = max_atk_adv + fn = _nvmlGetFunctionPointer("nvmlSystemSetConfComputeKeyRotationThresholdInfo") + ret = fn(byref(c_keyRotationThrInfo)) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlSystemGetConfComputeKeyRotationThresholdInfo(): + c_keyRotationThrInfo = c_nvmlConfComputeGetKeyRotationThresholdInfo_t(0) + c_keyRotationThrInfo.version = ConfComputeGetKeyRotationThresholdInfo_v1 + fn = _nvmlGetFunctionPointer("nvmlSystemGetConfComputeKeyRotationThresholdInfo") + ret = fn(byref(c_keyRotationThrInfo)) + _nvmlCheckReturn(ret) + return c_keyRotationThrInfo + +## GPM ## +######### + +## Enums/defines + +#### GPM Metric Identifiers +NVML_GPM_METRIC_GRAPHICS_UTIL = 1 # Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 +NVML_GPM_METRIC_SM_UTIL = 2 # Percentage of SMs that were busy. 0.0 - 100.0 +NVML_GPM_METRIC_SM_OCCUPANCY = 3 # Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 +NVML_GPM_METRIC_INTEGER_UTIL = 4 # Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 +NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5 # Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6 # Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7 # Percentage of time the GPU's SMs were doing HMMA tensor operations. 
0.0 - 100.0 +NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9 # Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_DRAM_BW_UTIL = 10 # Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 +NVML_GPM_METRIC_FP64_UTIL = 11 # Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 +NVML_GPM_METRIC_FP32_UTIL = 12 # Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 +NVML_GPM_METRIC_FP16_UTIL = 13 # Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 +NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20 # PCIe traffic from this GPU in MiB/sec +NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21 # PCIe traffic to this GPU in MiB/sec +NVML_GPM_METRIC_NVDEC_0_UTIL = 30 # Percent utilization of NVDEC 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_1_UTIL = 31 # Percent utilization of NVDEC 1. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_2_UTIL = 32 # Percent utilization of NVDEC 2. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_3_UTIL = 33 # Percent utilization of NVDEC 3. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_4_UTIL = 34 # Percent utilization of NVDEC 4. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_5_UTIL = 35 # Percent utilization of NVDEC 5. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_6_UTIL = 36 # Percent utilization of NVDEC 6. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_7_UTIL = 37 # Percent utilization of NVDEC 7. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_0_UTIL = 40 # Percent utilization of NVJPG 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_1_UTIL = 41 # Percent utilization of NVJPG 1. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_2_UTIL = 42 # Percent utilization of NVJPG 2. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_3_UTIL = 43 # Percent utilization of NVJPG 3. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_4_UTIL = 44 # Percent utilization of NVJPG 4. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_5_UTIL = 45 # Percent utilization of NVJPG 5. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_6_UTIL = 46 # Percent utilization of NVJPG 6. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_7_UTIL = 47 # Percent utilization of NVJPG 7. 0.0 - 100.0 +NVML_GPM_METRIC_NVOFA_0_UTIL = 50 # Percent utilization of NVOFA 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVOFA_1_UTIL = 51 # Percent utilization of NVOFA 1. 
0.0 - 100.0 +NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60 # NvLink read bandwidth for all links in MiB/sec +NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61 # NvLink write bandwidth for all links in MiB/sec +NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62 # NvLink read bandwidth for link 0 in MiB/sec +NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63 # NvLink write bandwidth for link 0 in MiB/sec +NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64 # NvLink read bandwidth for link 1 in MiB/sec +NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65 # NvLink write bandwidth for link 1 in MiB/sec +NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66 # NvLink read bandwidth for link 2 in MiB/sec +NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67 # NvLink write bandwidth for link 2 in MiB/sec +NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68 # NvLink read bandwidth for link 3 in MiB/sec +NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69 # NvLink write bandwidth for link 3 in MiB/sec +NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70 # NvLink read bandwidth for link 4 in MiB/sec +NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71 # NvLink write bandwidth for link 4 in MiB/sec +NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72 # NvLink read bandwidth for link 5 in MiB/sec +NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73 # NvLink write bandwidth for link 5 in MiB/sec +NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74 # NvLink read bandwidth for link 6 in MiB/sec +NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75 # NvLink write bandwidth for link 6 in MiB/sec +NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76 # NvLink read bandwidth for link 7 in MiB/sec +NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77 # NvLink write bandwidth for link 7 in MiB/sec +NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78 # NvLink read bandwidth for link 8 in MiB/sec +NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79 # NvLink write bandwidth for link 8 in MiB/sec +NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80 # NvLink read bandwidth for link 9 in MiB/sec +NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81 # NvLink write bandwidth for link 9 in MiB/sec +NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82 # NvLink read bandwidth for link 10 in MiB/sec +NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83 # NvLink write bandwidth for link 10 in MiB/sec +NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84 # NvLink read bandwidth for link 11 in MiB/sec +NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85 # NvLink write bandwidth for link 11 in MiB/sec +NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86 # NvLink read bandwidth for link 12 in MiB/sec +NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87 # NvLink write bandwidth for link 12 in MiB/sec +NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88 # NvLink read bandwidth for link 13 in MiB/sec +NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89 # NvLink write bandwidth for link 13 in MiB/sec +NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90 # NvLink read bandwidth for link 14 in MiB/sec +NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91 # NvLink write bandwidth for link 14 in MiB/sec +NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92 # NvLink read bandwidth for link 15 in MiB/sec +NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93 # NvLink write bandwidth for link 15 in MiB/sec +NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94 # NvLink read bandwidth for link 16 in MiB/sec +NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95 # NvLink write bandwidth for link 16 in MiB/sec +NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96 # NvLink read bandwidth for link 17 in MiB/sec +NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97 # NvLink write bandwidth for link 17 in MiB/sec +NVML_GPM_METRIC_MAX = 98 + +## Structs + +class c_nvmlUnitInfo_t(_PrintableStructure): + 
_fields_ = [ + ('name', c_char * 96), + ('id', c_char * 96), + ('serial', c_char * 96), + ('firmwareVersion', c_char * 96), + ] + +class struct_c_nvmlGpmSample_t(Structure): + pass # opaque handle +c_nvmlGpmSample_t = POINTER(struct_c_nvmlGpmSample_t) + +class c_metricInfo_t(Structure): + _fields_ = [ + ("shortName", c_char_p), + ("longName", c_char_p), + ("unit", c_char_p), + ] + +class c_nvmlGpmMetric_t(_PrintableStructure): + _fields_ = [ + ('metricId', c_uint), + ('nvmlReturn', _nvmlReturn_t), + ('value', c_double), + ('metricInfo', c_metricInfo_t) + ] + +class c_nvmlGpmMetricsGet_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('numMetrics', c_uint), + ('sample1', c_nvmlGpmSample_t), + ('sample2', c_nvmlGpmSample_t), + ('metrics', c_nvmlGpmMetric_t * NVML_GPM_METRIC_MAX) + ] + +NVML_GPM_METRICS_GET_VERSION = 1 + +class c_nvmlGpmSupport_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('isSupportedDevice', c_uint), + ] + +NVML_GPM_SUPPORT_VERSION = 1 + +## Functions + +def nvmlGpmMetricsGet(metricsGet): + fn = _nvmlGetFunctionPointer("nvmlGpmMetricsGet") + ret = fn(byref(metricsGet)) + _nvmlCheckReturn(ret) + return metricsGet + +def nvmlGpmSampleFree(gpmSample): + fn = _nvmlGetFunctionPointer("nvmlGpmSampleFree") + ret = fn(gpmSample) + _nvmlCheckReturn(ret) + return + +def nvmlGpmSampleAlloc(): + gpmSample = c_nvmlGpmSample_t() + fn = _nvmlGetFunctionPointer("nvmlGpmSampleAlloc") + ret = fn(byref(gpmSample)) + _nvmlCheckReturn(ret) + return gpmSample + +def nvmlGpmSampleGet(device, gpmSample): + fn = _nvmlGetFunctionPointer("nvmlGpmSampleGet") + ret = fn(device, gpmSample) + _nvmlCheckReturn(ret) + return gpmSample + +def nvmlGpmMigSampleGet(device, gpuInstanceId, gpmSample): + fn = _nvmlGetFunctionPointer("nvmlGpmMigSampleGet") + ret = fn(device, gpuInstanceId, gpmSample) + _nvmlCheckReturn(ret) + return gpmSample + +def nvmlGpmQueryDeviceSupport(device): + gpmSupport = c_nvmlGpmSupport_t() + gpmSupport.version = NVML_GPM_SUPPORT_VERSION + fn = _nvmlGetFunctionPointer("nvmlGpmQueryDeviceSupport") + ret = fn(device, byref(gpmSupport)) + _nvmlCheckReturn(ret) + return gpmSupport + +def nvmlGpmSetStreamingEnabled(device, state): + c_state = c_uint(state) + fn = _nvmlGetFunctionPointer("nvmlGpmSetStreamingEnabled") + ret = fn(device, c_state) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlGpmQueryIfStreamingEnabled(device): + c_state = c_uint() + fn = _nvmlGetFunctionPointer("nvmlGpmQueryIfStreamingEnabled") + ret = fn(device, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + +# Low Power Structure and Function + +NVML_NVLINK_POWER_STATE_HIGH_SPEED = 0x0 +NVML_NVLINK_POWER_STATE_LOW = 0x1 + +NVML_NVLINK_LOW_POWER_THRESHOLD_MIN = 0x1 +NVML_NVLINK_LOW_POWER_THRESHOLD_MAX = 0x1FFF +NVML_NVLINK_LOW_POWER_THRESHOLD_RESET = 0xFFFFFFFF +NVML_NVLINK_LOW_POWER_THRESHOLD_DEFAULT = NVML_NVLINK_LOW_POWER_THRESHOLD_RESET + +class c_nvmlNvLinkPowerThres_t(Structure): + _fields_ = [ + ("lowPwrThreshold", c_uint), + ] + +def nvmlDeviceSetNvLinkDeviceLowPowerThreshold(device, l1threshold): + c_info = c_nvmlNvLinkPowerThres_t() + c_info.lowPwrThreshold = l1threshold + fn = _nvmlGetFunctionPointer("nvmlDeviceSetNvLinkDeviceLowPowerThreshold") + ret = fn(device, byref(c_info)) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +NVML_GPU_FABRIC_UUID_LEN = 16 + +_nvmlGpuFabricState_t = c_uint +NVML_GPU_FABRIC_STATE_NOT_SUPPORTED = 0 +NVML_GPU_FABRIC_STATE_NOT_STARTED = 1 +NVML_GPU_FABRIC_STATE_IN_PROGRESS = 2 +NVML_GPU_FABRIC_STATE_COMPLETED = 3 + +class 
c_nvmlGpuFabricInfo_t(_PrintableStructure): + _fields_ = [ + ("clusterUuid", c_char * NVML_DEVICE_UUID_BUFFER_SIZE), + ("status", _nvmlReturn_t), + ("cliqueId", c_uint32), + ("state", _nvmlGpuFabricState_t) + ] + +NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED = 0 +NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE = 1 +NVML_GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE = 2 +NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW = 0 +NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW = 0x11 + +NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_NOT_SUPPORTED = 0 +NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_TRUE = 1 +NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_FALSE = 2 +NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_RECOVERY = 2 +NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_RECOVERY = 0x11 + +NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_NOT_SUPPORTED = 0 +NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_TRUE = 1 +NVML_GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_FALSE = 2 +NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_UNHEALTHY = 4 +NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_UNHEALTHY = 0x11 + +NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_NOT_SUPPORTED = 0 +NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_TRUE = 1 +NVML_GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_RECOVERY_FALSE = 2 +NVML_GPU_FABRIC_HEALTH_MASK_SHIFT_ACCESS_TIMEOUT_RECOVERY = 6 +NVML_GPU_FABRIC_HEALTH_MASK_WIDTH_ACCESS_TIMEOUT_RECOVERY = 0x11 + +nvmlGpuFabricInfo_v2 = 0x02000024 + +class c_nvmlGpuFabricInfoV_t(_PrintableStructure): + _fields_ = [ + ("version", c_uint), + ("clusterUuid", c_char * NVML_GPU_FABRIC_UUID_LEN), + ("status", _nvmlReturn_t), + ("cliqueId", c_uint32), + ("state", _nvmlGpuFabricState_t), + ("healthMask", c_uint32) + ] + + def __init__(self): + super(c_nvmlGpuFabricInfoV_t, self).__init__(version=nvmlGpuFabricInfo_v2) + +def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuFabricInfo"); + ret = fn(device, gpuFabricInfo) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetGpuFabricInfoV(device, gpuFabricInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuFabricInfoV"); + ret = fn(device, gpuFabricInfo) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +###################### +## Enums/defines +#### NVML GPU NVLINK BW MODE +NVML_GPU_NVLINK_BW_MODE_FULL = 0x0 +NVML_GPU_NVLINK_BW_MODE_OFF = 0x1 +NVML_GPU_NVLINK_BW_MODE_MIN = 0x2 +NVML_GPU_NVLINK_BW_MODE_HALF = 0x3 +NVML_GPU_NVLINK_BW_MODE_3QUARTER = 0x4 +NVML_GPU_NVLINK_BW_MODE_COUNT = 0x5 + +def nvmlSystemSetNvlinkBwMode(mode): + fn = _nvmlGetFunctionPointer("nvmlSystemSetNvlinkBwMode") + ret = fn(mode) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlSystemGetNvlinkBwMode(): + mode = c_uint() + fn = _nvmlGetFunctionPointer("nvmlSystemGetNvlinkBwMode") + ret = fn(byref(mode)) + _nvmlCheckReturn(ret) + return mode.value + +_nvmlPowerScopeType_t = c_uint +NVML_POWER_SCOPE_GPU = 0 +NVML_POWER_SCOPE_MODULE = 1 +NVML_POWER_SCOPE_MEMORY = 2 + +class c_nvmlPowerValue_v2_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('powerScope', _nvmlPowerScopeType_t), + ('powerValueMw', c_uint), + ] + _fmt_ = {'': "%d B"} + +nvmlPowerValue_v2 = 0x0200000C + +def nvmlDeviceSetPowerManagementLimit_v2(device, powerScope, powerLimit, version=nvmlPowerValue_v2): + c_powerScope = _nvmlPowerScopeType_t(powerScope) + c_powerValue = c_nvmlPowerValue_v2_t() + c_powerValue.version = c_uint(version) + c_powerValue.powerScope = c_powerScope + c_powerValue.powerValueMw = c_uint(powerLimit) + fn = 
_nvmlGetFunctionPointer("nvmlDeviceSetPowerManagementLimit_v2") + ret = fn(device, byref(c_powerValue)) + return NVML_SUCCESS + +class c_nvmlEccSramErrorStatus_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('aggregateUncParity', c_ulonglong), + ('aggregateUncSecDed', c_ulonglong), + ('aggregateCor', c_ulonglong), + ('volatileUncParity', c_ulonglong), + ('volatileUncSecDed', c_ulonglong), + ('volatileCor', c_ulonglong), + ('aggregateUncBucketL2', c_ulonglong), + ('aggregateUncBucketSm', c_ulonglong), + ('aggregateUncBucketPcie', c_ulonglong), + ('aggregateUncBucketMcu', c_ulonglong), + ('aggregateUncBucketOther', c_ulonglong), + ('bThresholdExceeded', c_uint) + ] + + def __init__(self): + super(c_nvmlEccSramErrorStatus_v1_t, self).__init__(version=nvmlEccSramErrorStatus_v1) + +nvmlEccSramErrorStatus_v1 = 0x1000068 +def nvmlDeviceGetSramEccErrorStatus(device, status): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSramEccErrorStatus") + ret = fn(device, status) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +NVML_DEV_CAP_EGM = (1 << 0) +nvmlDeviceCapabilities_v1 = 0x1000008 + +class c_nvmlDeviceCapabilities_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('capMask', c_uint), + ] + + def __init__(self): + super(c_nvmlDeviceCapabilities_v1_t, self).__init__(version=nvmlDeviceCapabilities_v1) + + +def nvmlDeviceGetCapabilities(device, caps): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCapabilities") + return fn(device, caps) + +class c_nvmlPlatformInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('ibGuid', c_char * 16), + ('rackGuid', c_char * 16), + ('chassisPhysicalSlotNumber', c_char), + ('computeSlotIndex', c_char), + ('nodeIndex', c_char), + ('peerType', c_char), + ('moduleId', c_char) + ] + + def __init__(self): + super(c_nvmlPlatformInfo_v1_t, self).__init__(version=nvmlPlatformInfo_v1) + +nvmlPlatformInfo_v1 = 0x100002c +def nvmlDeviceGetPlatformInfo(device, platformInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPlatformInfo") + ret = fn(device, platformInfo) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +class c_nvmlMask255_t(_PrintableStructure): + _fields_ = [ + ('mask', c_uint * 8), + ] + +NVML_WORKLOAD_POWER_MAX_PROFILES = 255 +NVML_POWER_PROFILE_MAX_P = 0 +NVML_POWER_PROFILE_MAX_Q = 1 +NVML_POWER_PROFILE_COMPUTE = 2 +NVML_POWER_PROFILE_MEMORY_BOUND = 3 +NVML_POWER_PROFILE_NETWORK = 4 +NVML_POWER_PROFILE_BALANCED = 5 +NVML_POWER_PROFILE_LLM_INFERENCE = 6 +NVML_POWER_PROFILE_LLM_TRAINING = 7 +NVML_POWER_PROFILE_RBM = 8 +NVML_POWER_PROFILE_DCPCIE = 9 +NVML_POWER_PROFILE_HMMA_SPARSE = 10 +NVML_POWER_PROFILE_HMMA_DENSE = 11 +NVML_POWER_PROFILE_SYNC_BALANCED = 12 +NVML_POWER_PROFILE_HPC = 13 +NVML_POWER_PROFILE_MIG = 14 +NVML_POWER_PROFILE_MAX = 15 + +nvmlWorkloadPowerProfileInfo_v1 = 0x100002c +class c_nvmlWorkloadPowerProfileInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('profileId', c_uint), + ('priority', c_uint), + ('conflictingmask', c_nvmlMask255_t) + ] + + def __init__(self): + super(c_nvmlWorkloadPowerProfileInfo_v1_t, self).__init__(version=nvmlWorkloadPowerProfileInfo_v1) + +nvmlWorkloadPowerProfileProfilesInfo_v1 = 0x1002bf8 +class c_nvmlWorkloadPowerProfileProfilesInfo_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('perfProfilesMask', c_nvmlMask255_t), + ('perfProfile', c_nvmlWorkloadPowerProfileInfo_v1_t * NVML_WORKLOAD_POWER_MAX_PROFILES) + ] + + def __init__(self): + super(c_nvmlWorkloadPowerProfileProfilesInfo_v1_t, 
self).__init__(version=nvmlWorkloadPowerProfileProfilesInfo_v1) + +nvmlWorkloadPowerProfileCurrentProfiles_v1 = 0x1000064 +class c_nvmlWorkloadPowerProfileCurrentProfiles_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('perfProfilesMask', c_nvmlMask255_t), + ('requestedProfilesMask', c_nvmlMask255_t), + ('enforcedProfilesMask', c_nvmlMask255_t) + ] + + def __init__(self): + super(c_nvmlWorkloadPowerProfileCurrentProfiles_v1_t, self).__init__(version=nvmlWorkloadPowerProfileCurrentProfiles_v1) + +nvmlWorkloadPowerProfileRequestedProfiles_v1 = 0x1000024 +class c_nvmlWorkloadPowerProfileRequestedProfiles_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('requestedProfilesMask', c_nvmlMask255_t), + ] + + def __init__(self): + super(c_nvmlWorkloadPowerProfileRequestedProfiles_v1_t, self).__init__(version=nvmlWorkloadPowerProfileRequestedProfiles_v1) + +def nvmlDeviceWorkloadPowerProfileGetProfilesInfo(device, profilesInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceWorkloadPowerProfileGetProfilesInfo") + ret = fn(device, profilesInfo) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceWorkloadPowerProfileGetCurrentProfiles(device, currentProfiles): + fn = _nvmlGetFunctionPointer("nvmlDeviceWorkloadPowerProfileGetCurrentProfiles") + ret = fn(device, currentProfiles) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceWorkloadPowerProfileSetRequestedProfiles(device, requestedProfiles): + fn = _nvmlGetFunctionPointer("nvmlDeviceWorkloadPowerProfileSetRequestedProfiles") + ret = fn(device, requestedProfiles) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceWorkloadPowerProfileClearRequestedProfiles(device, requestedProfiles): + fn = _nvmlGetFunctionPointer("nvmlDeviceWorkloadPowerProfileClearRequestedProfiles") + ret = fn(device, requestedProfiles) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetNvlinkSupportedBwModes(device, supportedBwModes): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvlinkSupportedBwModes") + ret = fn(device, supportedBwModes) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceGetNvlinkBwMode(device, getBwMode): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvlinkBwMode") + ret = fn(device, getBwMode) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +def nvmlDeviceSetNvlinkBwMode(device, setBwMode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetNvlinkBwMode") + ret = fn(device, setBwMode) + _nvmlCheckReturn(ret) + return NVML_SUCCESS + +nvmlDramEncryptionInfo_v1 = 0x01000008 + +class c_nvmlDramEncryptionInfo_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('encryptionState', _nvmlEnableState_t), + ] + + def __init__(self): + super(c_nvmlDramEncryptionInfo_t, self).__init__(version=nvmlDramEncryptionInfo_v1) + +def nvmlDeviceGetDramEncryptionMode(handle): + c_currState = c_nvmlDramEncryptionInfo_t() + c_pendingState = c_nvmlDramEncryptionInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDramEncryptionMode") + ret = fn(handle, byref(c_currState), byref(c_pendingState)) + _nvmlCheckReturn(ret) + return [c_currState.encryptionState, c_pendingState.encryptionState] + +# added to API +def nvmlDeviceGetCurrentDramEncryptionMode(handle): + return nvmlDeviceGetDramEncryptionMode(handle)[0] + +# added to API +def nvmlDeviceGetPendingDramEncryptionMode(handle): + return nvmlDeviceGetDramEncryptionMode(handle)[1] + +def nvmlDeviceSetDramEncryptionMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDramEncryptionMode") + c_dramEncryptionMode = 
c_nvmlDramEncryptionInfo_t() + c_dramEncryptionMode.encryptionState = mode; + ret = fn(handle, byref(c_dramEncryptionMode)) + _nvmlCheckReturn(ret) + return None + +# Power Smoothing defines +NVML_POWER_SMOOTHING_MAX_NUM_PROFILES = 5 +NVML_POWER_SMOOTHING_ADMIN_OVERRIDE_NOT_SET = 0xFFFFFFFF +NVML_POWER_SMOOTHING_PROFILE_PARAM_PERCENT_TMP_FLOOR = 0 +NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_UP_RATE = 1 +NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_RATE = 2 +NVML_POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_HYSTERESIS = 3 + +nvmlPowerSmoothingState_v1=0x1000008 +class c_nvmlPowerSmoothingState_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('state', c_uint), + ] + + def __init__(self): + super(c_nvmlPowerSmoothingState_v1_t, self).__init__(version=nvmlPowerSmoothingState_v1) + +nvmlPowerSmoothingProfile_v1=0x1000018 +class c_nvmlPowerSmoothingProfile_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('profileId', c_uint), + ('paramId', c_uint), + ('value', c_double), + ] + + def __init__(self): + super(c_nvmlPowerSmoothingProfile_v1_t, self).__init__(version=nvmlPowerSmoothingProfile_v1) + +def nvmlDevicePowerSmoothingActivatePresetProfile(device, profile): + fn = _nvmlGetFunctionPointer("nvmlDevicePowerSmoothingActivatePresetProfile") + ret = fn(device, profile) + _nvmlCheckReturn(ret) + +def nvmlDevicePowerSmoothingUpdatePresetProfileParam(device, profile): + fn = _nvmlGetFunctionPointer("nvmlDevicePowerSmoothingUpdatePresetProfileParam") + ret = fn(device, profile) + _nvmlCheckReturn(ret) + +def nvmlDevicePowerSmoothingSetState(device, state): + fn = _nvmlGetFunctionPointer("nvmlDevicePowerSmoothingSetState") + ret = fn(device, state) + _nvmlCheckReturn(ret) + diff --git a/vllm/utils.py b/vllm/utils.py index 8b926959875..e1687527666 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2239,34 +2239,13 @@ def import_pynvml(): This causes errors when both of them are installed. Starting from version 12.0, it migrates to a new module named `pynvml_utils` to avoid the conflict. - - TL;DR: if users have pynvml<12.0 installed, it will cause problems. - Otherwise, `import pynvml` will import the correct module. - We take the safest approach here, to manually import the correct - `pynvml.py` module from the `nvidia-ml-py` package. + It is so confusing that many packages in the community use the + unofficial one by mistake, and we have to handle this case. + For example, `nvcr.io/nvidia/pytorch:24.12-py3` uses the unofficial + one, and it will cause errors, see the issue + https://github.com/vllm-project/vllm/issues/12847 for example. + After all the troubles, we decide to copy the official `pynvml` + module to our codebase, and use it directly. """ - if TYPE_CHECKING: - import pynvml - return pynvml - if "pynvml" in sys.modules: - import pynvml - if pynvml.__file__.endswith("__init__.py"): - # this is pynvml < 12.0 - raise RuntimeError( - "You are using a deprecated `pynvml` package. " - "Please uninstall `pynvml` or upgrade to at least" - " version 12.0. 
See https://pypi.org/project/pynvml " - "for more information.") - return sys.modules["pynvml"] - import importlib.util - import os - import site - for site_dir in site.getsitepackages(): - pynvml_path = os.path.join(site_dir, "pynvml.py") - if os.path.exists(pynvml_path): - spec = importlib.util.spec_from_file_location( - "pynvml", pynvml_path) - pynvml = importlib.util.module_from_spec(spec) - sys.modules["pynvml"] = pynvml - spec.loader.exec_module(pynvml) - return pynvml + import vllm.third_party.pynvml as pynvml + return pynvml From ee92c4e8a8d87c35d8f6ea7fea0efe26c4b411ec Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Sun, 9 Feb 2025 02:56:40 -0800 Subject: [PATCH 0080/1240] [MISC] Always import version library first in the vllm package (#12979) Signed-off-by: Lu Fang Signed-off-by: Louis Ulmer --- vllm/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/__init__.py b/vllm/__init__.py index 566c5116d5f..457780824c7 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,5 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +# The version.py should be independent library, and we always import the +# version library first. Such assumption is critical for some customization. +from .version import __version__, __version_tuple__ # isort:skip + import os import torch @@ -19,8 +23,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams -from .version import __version__, __version_tuple__ - # set some common config/environment variables that should be set # for all processes created by vllm and all processes # that interact with vllm workers. From 4d5e3fea24a0136f906f49818e2624b56c9aed27 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 10 Feb 2025 09:38:57 +0800 Subject: [PATCH 0081/1240] [core] improve error handling when wake up from sleep mode (#12981) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- csrc/cumem_allocator.cpp | 63 ++++++++++++++++++++++----- tests/basic_correctness/test_cumem.py | 27 ++++++++++++ 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp index e8555d853b7..fab6ca36d42 100644 --- a/csrc/cumem_allocator.cpp +++ b/csrc/cumem_allocator.cpp @@ -12,15 +12,21 @@ extern "C" { #include #include -#define CUDA_CHECK(condition) \ - do { \ - CUresult error = condition; \ - if (error != 0) { \ - char* error_string; \ - cuGetErrorString(error, (const char**)&error_string); \ - std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \ - << __LINE__ << std::endl; \ - } \ +char error_msg[10240]; // 10KB buffer to store error messages +CUresult no_error = CUresult(0); +CUresult error_code = no_error; // store error code + +#define CUDA_CHECK(condition) \ + do { \ + CUresult error = condition; \ + if (error != 0) { \ + error_code = error; \ + char* error_string; \ + cuGetErrorString(error, (const char**)&error_string); \ + snprintf(error_msg, sizeof(error_msg), "CUDA Error: %s at %s:%d", \ + error_string, __FILE__, __LINE__); \ + std::cerr << error_msg << std::endl; \ + } \ } while (0) // Global references to Python callables @@ -54,14 +60,22 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, // Allocate memory using cuMemCreate CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + if (error_code != 0) { + return; + } CUDA_CHECK(cuMemMap(d_mem, size, 0, 
*p_memHandle, 0)); - + if (error_code != 0) { + return; + } CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = device; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); + if (error_code != 0) { + return; + } // std::cout << "create_and_map: device=" << device << ", size=" << size << ", // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; } @@ -73,7 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size, // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; ensure_context(device); CUDA_CHECK(cuMemUnmap(d_mem, size)); + if (error_code != 0) { + return; + } CUDA_CHECK(cuMemRelease(*p_memHandle)); + if (error_code != 0) { + return; + } } PyObject* create_tuple_from_c_integers(unsigned long long a, @@ -121,12 +141,16 @@ void* my_malloc(ssize_t size, int device, CUstream stream) { size_t granularity; CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - + if (error_code != 0) { + return nullptr; + } size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; CUdeviceptr d_mem; CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); - + if (error_code != 0) { + return nullptr; + } // allocate the CUmemGenericAllocationHandle CUmemGenericAllocationHandle* p_memHandle = (CUmemGenericAllocationHandle*)malloc( @@ -208,6 +232,9 @@ void my_free(void* ptr, ssize_t size, int device, CUstream stream) { // free address and the handle CUDA_CHECK(cuMemAddressFree(d_mem, size)); + if (error_code != 0) { + return; + } free(p_memHandle); } @@ -258,6 +285,12 @@ static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + if (error_code != 0) { + error_code = no_error; + PyErr_SetString(PyExc_RuntimeError, error_msg); + return nullptr; + } + Py_RETURN_NONE; } @@ -282,6 +315,12 @@ static PyObject* python_create_and_map(PyObject* self, PyObject* args) { create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + if (error_code != 0) { + error_code = no_error; + PyErr_SetString(PyExc_RuntimeError, error_msg); + return nullptr; + } + Py_RETURN_NONE; } diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index da9239b0940..4e9f1bf1cf8 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest import torch from vllm import LLM, SamplingParams @@ -9,6 +10,32 @@ from ..utils import fork_new_process_for_each_test +@fork_new_process_for_each_test +def test_python_error(): + """ + Test if Python error occurs when there's low-level + error happening from the C++ side. 
+ """ + allocator = CuMemAllocator.get_instance() + total_bytes = torch.cuda.mem_get_info()[1] + alloc_bytes = int(total_bytes * 0.7) + tensors = [] + with allocator.use_memory_pool(): + # allocate 70% of the total memory + x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda') + tensors.append(x) + # release the memory + allocator.sleep() + + # allocate more memory than the total memory + y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda') + tensors.append(y) + with pytest.raises(RuntimeError): + # when the allocator is woken up, it should raise an error + # because we don't have enough memory + allocator.wake_up() + + @fork_new_process_for_each_test def test_basic_cumem(): # some tensors from default memory pool From 3088ecb610f235996c980a454b9aaca9f48e7e6a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 10 Feb 2025 10:28:59 +0800 Subject: [PATCH 0082/1240] [core][rlhf] add colocate example for RLHF (#12984) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- .buildkite/test-pipeline.yaml | 4 +- .../{ray_placement.py => rlhf_colocate.py} | 84 +++++++++++++++++-- 2 files changed, 78 insertions(+), 10 deletions(-) rename examples/offline_inference/{ray_placement.py => rlhf_colocate.py} (56%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ab6a576b22b..948eab97ffa 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -128,7 +128,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile - examples/offline_inference/rlhf.py - - examples/offline_inference/ray_placement.py + - examples/offline_inference/rlhf_colocate.py commands: - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py @@ -137,7 +137,7 @@ steps: # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - python3 ../examples/offline_inference/rlhf.py - - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py + - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py - label: Metrics, Tracing Test # 10min num_gpus: 2 diff --git a/examples/offline_inference/ray_placement.py b/examples/offline_inference/rlhf_colocate.py similarity index 56% rename from examples/offline_inference/ray_placement.py rename to examples/offline_inference/rlhf_colocate.py index cd801a3c0c8..b921bc71feb 100644 --- a/examples/offline_inference/ray_placement.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -1,13 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 """ -a simple demonstration to show how to control -the placement of the vLLM workers with Ray. -The key is to set VLLM_RAY_PER_WORKER_GPUS and -VLLM_RAY_BUNDLE_INDICES properly. +a simple demonstration to show how to co-locate +vLLM worker with training actors on the same GPUs, +for RLHF-like applications. +The key points: +- Control the placement of the vLLM workers with Ray, by setting + VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly. +- Use cuda-ipc to pass tensors, since NCCL does not work when we have + multiple processes on the same GPU. 
""" import os import ray +import torch from ray.util.placement_group import placement_group from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -19,7 +24,33 @@ class MyWorker(Worker): def report_device_id(self) -> str: from vllm.platforms import current_platform - return current_platform.get_device_uuid(self.device.index) + self.device_uuid = current_platform.get_device_uuid(self.device.index) + return self.device_uuid + + def update_weights_from_ipc_handles(self, ipc_handles): + handles = ipc_handles[self.device_uuid] + device_id = self.device.index + weights = [] + for name, handle in handles.items(): + func, args = handle + list_args = list(args) + # the key is to change device id to the current device id + # in case two processes have different CUDA_VISIBLE_DEVICES + list_args[6] = device_id + tensor = func(*list_args) + weights.append((name, tensor)) + self.model_runner.model.load_weights(weights=weights) + torch.cuda.synchronize() + + def check_weights_changed(self): + """ + Check if the weights are updated to 0. + """ + weights_updated = True + for name, p in self.model_runner.model.named_parameters(): + weights_updated = weights_updated and torch.allclose( + p, torch.zeros_like(p)) + return weights_updated class MyLLM(LLM): @@ -40,12 +71,32 @@ def __init__(self, *args, bundle_indices: list, **kwargs): class RayTrainingActor: - def report_device_id(self) -> str: + def __init__(self): + # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs + from transformers import AutoModelForCausalLM + self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + self.model.to("cuda:0") + for name, p in self.model.named_parameters(): + p.data.zero_() + torch.cuda.synchronize() # the argument for get_device_uuid is the index # of the GPU in the visible devices. - # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs from vllm.platforms import current_platform - return current_platform.get_device_uuid(0) + self.device_uuid = current_platform.get_device_uuid(0) + + def report_device_id(self) -> str: + return self.device_uuid + + def get_weight_ipc_handles(self): + from torch.multiprocessing.reductions import reduce_tensor + data = {} + for name, p in self.model.named_parameters(): + # the training actor might only have a subset of the weights + # and need to all-gather the weights from all the actors. + # for demonstration, here we assume all training actors have + # the full weights. 
+ data[name] = reduce_tensor(p.detach()) + return {self.device_uuid: data} # ray manages 4 GPUs @@ -78,6 +129,8 @@ def report_device_id(self) -> str: ), )(RayTrainingActor).remote() training_actors.append(training_actor) + +for bundle_index, training_actor in enumerate(training_actors): device_id = ray.get(training_actor.report_device_id.remote()) print(f"training actor {bundle_index} is on {device_id}") training_actor_device_ids.append(device_id) @@ -119,3 +172,18 @@ def report_device_id(self) -> str: # the last two training actors should be # on the same GPUs as the second inference engine assert training_actor_device_ids[2:] == inference_engine_device_ids[1] + +print("gather all the IPC handles from the training actors") +ipc_handles = {} +for actor in training_actors: + ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote())) + +print("update the weights of the inference engines") +for llm in inference_engines: + ray.get( + llm.collective_rpc.remote("update_weights_from_ipc_handles", + args=(ipc_handles, ))) +print("check if the weights are updated") +for llm in inference_engines: + assert ray.get( + llm.collective_rpc.remote("check_weights_changed", args=tuple())) From a22ab352f37d9f75c30a4ce54ceb892a73b00fda Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 9 Feb 2025 19:35:56 -0800 Subject: [PATCH 0083/1240] [V1] Use msgpack for core request serialization (#12918) Signed-off-by: Nick Hill Signed-off-by: Louis Ulmer --- vllm/v1/engine/__init__.py | 42 ++++++++---------------- vllm/v1/engine/core.py | 61 +++++++++++++++-------------------- vllm/v1/engine/core_client.py | 27 +++++++--------- vllm/v1/serial_utils.py | 27 +++++++--------- 4 files changed, 62 insertions(+), 95 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index b05ef3cc8c7..30e1185019d 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,20 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 import enum -from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Optional, Union import msgspec +from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalKwargs +from vllm.multimodal.inputs import PlaceholderRange +from vllm.sampling_params import SamplingParams from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import LogprobsLists, LogprobsTensors -if TYPE_CHECKING: - from vllm.lora.request import LoRARequest - from vllm.multimodal import MultiModalKwargs - from vllm.multimodal.inputs import PlaceholderRange - from vllm.sampling_params import SamplingParams - # These are possible values of RequestOutput.finish_reason, # so form part of the external API. FINISH_REASON_STRINGS = ("stop", "length", "abort") @@ -39,8 +36,11 @@ def __str__(self): return FINISH_REASON_STRINGS[self.value] -@dataclass -class EngineCoreRequest: +class EngineCoreRequest( + msgspec.Struct, + array_like=True, # type: ignore[call-arg] + omit_defaults=True, # type: ignore[call-arg] + gc=False): # type: ignore[call-arg] # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, # but this object is currently not playing well with msgspec @@ -51,13 +51,13 @@ class EngineCoreRequest: # Detokenizer, but set to None when it is added to EngineCoreClient. 
prompt: Optional[str] prompt_token_ids: List[int] - mm_inputs: Optional[List[Optional["MultiModalKwargs"]]] + mm_inputs: Optional[List[Optional[MultiModalKwargs]]] mm_hashes: Optional[List[str]] - mm_placeholders: Optional[List["PlaceholderRange"]] - sampling_params: "SamplingParams" + mm_placeholders: Optional[List[PlaceholderRange]] + sampling_params: SamplingParams eos_token_id: Optional[int] arrival_time: float - lora_request: Optional["LoRARequest"] + lora_request: Optional[LoRARequest] class EngineCoreOutput( @@ -94,16 +94,6 @@ class EngineCoreOutputs( scheduler_stats: SchedulerStats -@dataclass -class EngineCoreProfile: - is_start: bool - - -@dataclass -class EngineCoreResetPrefixCache: - pass - - class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets @@ -113,7 +103,3 @@ class EngineCoreRequestType(enum.Enum): ABORT = b'\x01' PROFILE = b'\x02' RESET_PREFIX_CACHE = b'\x03' - - -EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, - EngineCoreResetPrefixCache, List[str]] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f3d40aa1e9c..c90667ba033 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,12 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 -import pickle import queue import signal import threading import time from multiprocessing.connection import Connection -from typing import List, Tuple, Type +from typing import Any, List, Tuple, Type import psutil import zmq @@ -19,13 +18,12 @@ from vllm.utils import get_exception_traceback, zmq_socket_ctx from vllm.v1.core.kv_cache_utils import get_kv_cache_config from vllm.v1.core.scheduler import Scheduler -from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, - EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion, EngineCoreResetPrefixCache) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, + EngineCoreRequestType) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus -from vllm.v1.serial_utils import MsgpackEncoder, PickleEncoder +from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -161,7 +159,8 @@ def __init__( # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. - self.input_queue: queue.Queue[EngineCoreRequestUnion] = queue.Queue() + self.input_queue: queue.Queue[Tuple[EngineCoreRequestType, + Any]] = queue.Queue() self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() threading.Thread(target=self.process_input_socket, args=(input_path, ), @@ -223,7 +222,7 @@ def run_busy_loop(self): while True: try: req = self.input_queue.get(timeout=POLLING_TIMEOUT_S) - self._handle_client_request(req) + self._handle_client_request(*req) break except queue.Empty: logger.debug("EngineCore busy loop waiting.") @@ -233,10 +232,10 @@ def run_busy_loop(self): except BaseException: raise - # 2) Handle any new client requests (Abort or Add). + # 2) Handle any new client requests. while not self.input_queue.empty(): req = self.input_queue.get_nowait() - self._handle_client_request(req) + self._handle_client_request(*req) # 3) Step the engine core. outputs = self.step() @@ -244,48 +243,40 @@ def run_busy_loop(self): # 5) Put EngineCoreOutputs into the output queue. 
self.output_queue.put_nowait(outputs) - def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: - """Handle EngineCoreRequest or EngineCoreABORT from Client.""" + def _handle_client_request(self, request_type: EngineCoreRequestType, + request: Any) -> None: + """Dispatch request from client.""" - if isinstance(request, EngineCoreRequest): + if request_type == EngineCoreRequestType.ADD: self.add_request(request) - elif isinstance(request, EngineCoreProfile): - self.model_executor.profile(request.is_start) - elif isinstance(request, EngineCoreResetPrefixCache): - self.reset_prefix_cache() - else: - # TODO: make an EngineCoreAbort wrapper - assert isinstance(request, list) + elif request_type == EngineCoreRequestType.ABORT: self.abort_requests(request) + elif request_type == EngineCoreRequestType.RESET_PREFIX_CACHE: + self.reset_prefix_cache() + elif request_type == EngineCoreRequestType.PROFILE: + self.model_executor.profile(request) def process_input_socket(self, input_path: str): """Input socket IO thread.""" # Msgpack serialization decoding. - decoder_add_req = PickleEncoder() - decoder_abort_req = PickleEncoder() + add_request_decoder = MsgpackDecoder(EngineCoreRequest) + generic_decoder = MsgpackDecoder() with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: while True: # (RequestType, RequestData) type_frame, data_frame = socket.recv_multipart(copy=False) - request_type = type_frame.buffer - request_data = data_frame.buffer + request_type = EngineCoreRequestType(bytes(type_frame.buffer)) # Deserialize the request data. - if request_type == EngineCoreRequestType.ADD.value: - request = decoder_add_req.decode(request_data) - elif request_type == EngineCoreRequestType.ABORT.value: - request = decoder_abort_req.decode(request_data) - elif request_type in ( - EngineCoreRequestType.PROFILE.value, - EngineCoreRequestType.RESET_PREFIX_CACHE.value): - request = pickle.loads(request_data) - else: - raise ValueError(f"Unknown RequestType: {request_type}") + decoder = add_request_decoder if ( + request_type + == EngineCoreRequestType.ADD) else generic_decoder + request = decoder.decode(data_frame.buffer) # Push to input queue for core busy loop. 
- self.input_queue.put_nowait(request) + self.input_queue.put_nowait((request_type, request)) def process_output_socket(self, output_path: str): """Output socket IO thread.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index cdc63acdb74..2d7d6b42ced 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -5,7 +5,7 @@ import signal import weakref from abc import ABC, abstractmethod -from typing import List, Optional, Type +from typing import Any, List, Optional, Type import zmq import zmq.asyncio @@ -14,12 +14,11 @@ from vllm.logger import init_logger from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, make_zmq_socket) -from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, - EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion, EngineCoreResetPrefixCache) +from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, + EngineCoreRequestType) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.executor.abstract import Executor -from vllm.v1.serial_utils import MsgpackDecoder, PickleEncoder +from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.utils import BackgroundProcHandle logger = init_logger(__name__) @@ -161,7 +160,7 @@ def sigusr1_handler(signum, frame): signal.signal(signal.SIGUSR1, sigusr1_handler) # Serialization setup. - self.encoder = PickleEncoder() + self.encoder = MsgpackEncoder() self.decoder = MsgpackDecoder(EngineCoreOutputs) # ZMQ setup. @@ -220,7 +219,7 @@ def get_output(self) -> EngineCoreOutputs: return self.decoder.decode(frame.buffer) def _send_input(self, request_type: EngineCoreRequestType, - request: EngineCoreRequestUnion) -> None: + request: Any) -> None: # (RequestType, SerializedRequest) msg = (request_type.value, self.encoder.encode(request)) @@ -237,12 +236,10 @@ def abort_requests(self, request_ids: List[str]) -> None: self._send_input(EngineCoreRequestType.ABORT, request_ids) def profile(self, is_start: bool = True) -> None: - self._send_input(EngineCoreRequestType.PROFILE, - EngineCoreProfile(is_start)) + self._send_input(EngineCoreRequestType.PROFILE, is_start) def reset_prefix_cache(self) -> None: - self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, - EngineCoreResetPrefixCache()) + self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, None) class AsyncMPClient(MPClient): @@ -277,7 +274,7 @@ async def process_outputs_socket(): return self.decoder.decode(await self.outputs_queue.get()) async def _send_input(self, request_type: EngineCoreRequestType, - request: EngineCoreRequestUnion) -> None: + request: Any) -> None: msg = (request_type.value, self.encoder.encode(request)) await self.input_socket.send_multipart(msg, copy=False) @@ -293,9 +290,7 @@ async def abort_requests_async(self, request_ids: List[str]) -> None: await self._send_input(EngineCoreRequestType.ABORT, request_ids) async def profile_async(self, is_start: bool = True) -> None: - await self._send_input(EngineCoreRequestType.PROFILE, - EngineCoreProfile(is_start)) + await self._send_input(EngineCoreRequestType.PROFILE, is_start) async def reset_prefix_cache_async(self) -> None: - await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, - EngineCoreResetPrefixCache()) + await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, None) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index a7fba65e7c9..3f000abcde0 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,21 +1,13 @@ # 
SPDX-License-Identifier: Apache-2.0 import pickle -from typing import Any +from typing import Any, Optional import torch from msgspec import msgpack -CUSTOM_TYPE_CODE_PICKLE = 1 - - -class PickleEncoder: - - def encode(self, obj: Any): - return pickle.dumps(obj) - - def decode(self, data: Any): - return pickle.loads(data) +CUSTOM_TYPE_TENSOR = 1 +CUSTOM_TYPE_PICKLE = 2 class MsgpackEncoder: @@ -34,8 +26,9 @@ def encode_into(self, obj: Any, buf: bytearray) -> None: class MsgpackDecoder: """Decoder with custom torch tensor serialization.""" - def __init__(self, t: Any): - self.decoder = msgpack.Decoder(t, ext_hook=custom_ext_hook) + def __init__(self, t: Optional[Any] = None): + args = () if t is None else (t, ) + self.decoder = msgpack.Decoder(*args, ext_hook=custom_ext_hook) def decode(self, obj: Any): return self.decoder.decode(obj) @@ -46,13 +39,15 @@ def custom_enc_hook(obj: Any) -> Any: # NOTE(rob): it is fastest to use numpy + pickle # when serializing torch tensors. # https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 - return msgpack.Ext(CUSTOM_TYPE_CODE_PICKLE, pickle.dumps(obj.numpy())) + return msgpack.Ext(CUSTOM_TYPE_TENSOR, pickle.dumps(obj.numpy())) - raise NotImplementedError(f"Objects of type {type(obj)} are not supported") + return msgpack.Ext(CUSTOM_TYPE_PICKLE, pickle.dumps(obj)) def custom_ext_hook(code: int, data: memoryview) -> Any: - if code == CUSTOM_TYPE_CODE_PICKLE: + if code == CUSTOM_TYPE_TENSOR: return torch.from_numpy(pickle.loads(data)) + if code == CUSTOM_TYPE_PICKLE: + return pickle.loads(data) raise NotImplementedError(f"Extension type code {code} is not supported") From 6e07e814970880424910fc46d291da0021387c1f Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Sun, 9 Feb 2025 22:45:07 -0500 Subject: [PATCH 0084/1240] Check if selected backend is None in get_attn_backend_cls() (#12975) Signed-off-by: Yuan Tang Signed-off-by: Louis Ulmer --- vllm/platforms/cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 4e0683b8a2d..179ee6a7d24 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -35,7 +35,7 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, use_v1: bool, use_mla: bool) -> str: - if selected_backend != _Backend.TORCH_SDPA: + if selected_backend and selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) logger.info("Using Torch SDPA backend.") return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" From eeff9b3393896a79c5f3ceb325b9e3e5b8d393fc Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 10 Feb 2025 13:03:43 +0800 Subject: [PATCH 0085/1240] [core] fix sleep mode and pytorch checkpoint compatibility (#13001) Signed-off-by: youkaichao Signed-off-by: Louis Ulmer --- tests/basic_correctness/test_cumem.py | 10 ++++++++-- vllm/model_executor/model_loader/weight_utils.py | 1 - 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 4e9f1bf1cf8..3ac948799d7 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -115,10 +115,16 @@ def model(x): @fork_new_process_for_each_test -def test_end_to_end(): +@pytest.mark.parametrize( + "model", + [ + "meta-llama/Llama-3.2-1B", # sleep mode with safetensors + "facebook/opt-125m" # sleep mode with pytorch checkpoint + ]) +def 
test_end_to_end(model): free, total = torch.cuda.mem_get_info() used_bytes_baseline = total - free # in case other process is running - llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True) + llm = LLM(model, enable_sleep_mode=True) prompt = "How are you?" sampling_params = SamplingParams(temperature=0, max_tokens=10) output = llm.generate(prompt, sampling_params) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 68ade319df2..8b2c5610f1f 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -462,7 +462,6 @@ def pt_weights_iterator( state = torch.load(bin_file, map_location="cpu", weights_only=True) yield from state.items() del state - torch.cuda.empty_cache() def get_gguf_extra_tensor_names( From 7d352c9450c1b09d1fd2661795026015e659c6a4 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Mon, 10 Feb 2025 01:09:33 -0500 Subject: [PATCH 0086/1240] [Doc] Add link to tool_choice tracking issue in tool_calling.md (#13003) Signed-off-by: Yuan Tang Signed-off-by: Louis Ulmer --- docs/source/features/tool_calling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 027ddb6d5ed..85a9e037398 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -1,6 +1,6 @@ # Tool Calling -vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but on the roadmap. +vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but [on the roadmap](gh-issue:13002). ## Quickstart From 03d234509596cad9bbbc5c24cf5491b4caae5b3c Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 10 Feb 2025 01:15:02 -0800 Subject: [PATCH 0087/1240] [misc] Add retries with exponential backoff for HF file existence check (#13008) Signed-off-by: Louis Ulmer --- vllm/transformers_utils/config.py | 61 ++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 42b45e10e3f..aade28610b3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -3,6 +3,7 @@ import enum import json import os +import time from pathlib import Path from typing import Any, Dict, Literal, Optional, Type, Union @@ -100,15 +101,33 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, # NB: file_exists will only check for the existence of the config file on # hf_hub. This will fail in offline mode. - try: - return file_exists(model, - config_name, - revision=revision, - token=HF_TOKEN) - except huggingface_hub.errors.OfflineModeIsEnabled: - # Don't raise in offline mode, all we know is that we don't have this - # file cached. - return False + + # Call HF to check if the file exists + # 2 retries and exponential backoff + max_retries = 2 + retry_delay = 2 + for attempt in range(max_retries): + try: + return file_exists(model, + config_name, + revision=revision, + token=HF_TOKEN) + except huggingface_hub.errors.OfflineModeIsEnabled: + # Don't raise in offline mode, + # all we know is that we don't have this + # file cached. 
+ return False + except Exception as e: + logger.error( + "Error checking file existence: %s, retrying %d of %d", e, + attempt + 1, max_retries) + if attempt == max_retries - 1: + logger.error("Error checking file existence: %s", e) + raise + time.sleep(retry_delay) + retry_delay *= 2 + continue + return False def patch_rope_scaling(config: PretrainedConfig) -> None: @@ -193,10 +212,26 @@ def get_config( # raise an offline mode error to indicate to the user that they # don't have files cached and may need to go online. # This is conveniently triggered by calling file_exists(). - file_exists(model, - HF_CONFIG_NAME, - revision=revision, - token=HF_TOKEN) + + # Call HF to check if the file exists + # 2 retries and exponential backoff + max_retries = 2 + retry_delay = 2 + for attempt in range(max_retries): + try: + file_exists(model, + HF_CONFIG_NAME, + revision=revision, + token=HF_TOKEN) + except Exception as e: + logger.error( + "Error checking file existence: %s, retrying %d of %d", + e, attempt + 1, max_retries) + if attempt == max_retries: + logger.error("Error checking file existence: %s", e) + raise e + time.sleep(retry_delay) + retry_delay *= 2 raise ValueError(f"No supported config format found in {model}") From 74c9b3899b53f8923766e253136b329345038b97 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 10 Feb 2025 18:45:21 +0800 Subject: [PATCH 0088/1240] [Bugfix] Clean up and fix multi-modal processors (#13012) Signed-off-by: DarkLight1337 Signed-off-by: Louis Ulmer --- docs/source/features/compatibility_matrix.md | 2 +- .../decoder_only/language/test_models.py | 10 ++ .../multimodal/processing/test_common.py | 2 +- tests/multimodal/utils.py | 3 - vllm/model_executor/models/chatglm.py | 160 +++++++----------- vllm/model_executor/models/qwen.py | 91 +++++----- vllm/model_executor/models/qwen2_vl.py | 10 +- 7 files changed, 124 insertions(+), 154 deletions(-) diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index b0018ebccf5..ee5db70c7d5 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -297,7 +297,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar * ✅ * ✅ * ? - * [✗](gh-issue:7968>) + * [✗](gh-issue:7968) * ? 
* ✅ * diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 1ad56241535..c6d5244318a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -26,6 +26,9 @@ "google/gemma-1.1-2b-it", # gemma marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + pytest.param( + "THUDM/chatglm3-6b", # ChatGLM (text-only) + ), pytest.param( "meta-llama/Llama-3.2-1B-Instruct", # llama marks=[pytest.mark.core_model, pytest.mark.cpu_model], @@ -43,6 +46,9 @@ "microsoft/phi-2", # phi marks=[pytest.mark.core_model], ), + pytest.param( + "Qwen/Qwen-7B", # qwen (text-only) + ), pytest.param( "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 marks=[pytest.mark.core_model], @@ -68,6 +74,10 @@ def test_models( ) -> None: with hf_runner(model, dtype=dtype) as hf_model: + if model.startswith("THUDM/chatglm3"): + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.transformer.output_layer + hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 8658e60bc5b..a56a9e2beef 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -89,7 +89,7 @@ def _test_processing_correctness( mm_data = { k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(limit))] + for _ in range(rng.randint(limit + 1))] for k, limit in limit_mm_per_prompt.items() } diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py index 9a336b7e60f..40fcfeeeac7 100644 --- a/tests/multimodal/utils.py +++ b/tests/multimodal/utils.py @@ -17,10 +17,7 @@ def random_video( min_wh: int, max_wh: int, ): - # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 num_frames = rng.randint(min_frames, max_frames) - num_frames = (num_frames // 2) * 2 - w, h = rng.randint(min_wh, max_wh, size=(2, )) return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 9ee9e9ca800..153c85cfb21 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -4,8 +4,8 @@ # https://github.com/THUDM/CogAgent """Inference-only CogAgent model compatible with THUDM weights.""" from argparse import Namespace -from typing import (Iterable, List, Mapping, Optional, Sequence, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch from torch import nn @@ -19,7 +19,6 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -37,12 +36,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import 
(BaseMultiModalProcessor, BaseProcessingInfo, BatchFeature, - BoundPromptReplacement, MultiModalFieldConfig, - PlaceholderFeaturesInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -53,39 +50,6 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) -logger = init_logger(__name__) - -IMAGE_TOKEN_ID = 151329 - - -def build_normalization_transform(image_size: int) -> transforms.Compose: - """ - Build a normalization transform which can be applied to one or - more input images from which we want to extract visual features. - - Args: - image_size: size of the image to be processed for visual embeddings. - - Returns: - Callable transform for normalizing and resizing one RGB image. - """ - - return transforms.Compose([ - transforms.Resize( - (image_size, image_size), - interpolation=InterpolationMode.BICUBIC, - ), - transforms.ToTensor(), - transforms.Normalize( - (0.48145466, 0.4578275, 0.40821073), - (0.26862954, 0.26130258, 0.27577711), - ), - ]) - - -def calculate_image_placeholder(vision_config): - return (vision_config["image_size"] // vision_config["patch_size"] // 2)**2 - class GLMImagePixelInputs(TypedDict): pixel_values: torch.Tensor @@ -109,9 +73,20 @@ def __init__( self.config = config self.tokenizer = tokenizer - if hasattr(self.config, "vision_config"): - self.image_transform = build_normalization_transform( - config.vision_config["image_size"]) + if vision_config := getattr(config, "vision_config", None): + image_size = vision_config["image_size"] + + self.image_transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ), + ]) else: self.image_transform = None @@ -150,9 +125,19 @@ def __call__( class GLM4VProcessingInfo(BaseProcessingInfo): - def __init__(self, ctx): - super().__init__(ctx) - self._pre_calculate() + def get_tokenizer(self): + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + return tokenizer + + def get_hf_config(self): + return self.ctx.get_hf_config(ChatGLMConfig) + + def get_hf_processor(self) -> GLM4VProcessor: + return GLM4VProcessor( + self.get_hf_config(), + self.get_tokenizer(), + ) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": 1} @@ -162,27 +147,21 @@ def get_mm_max_tokens_per_item( seq_len: int, mm_counts: Mapping[str, int], ) -> Mapping[str, int]: + return {"image": self.get_num_image_feature_tokens()} - return {"image": self.image_token_num + 2} - - def _pre_calculate(self): + def get_num_image_tokens(self) -> int: hf_config = self.get_hf_config() - vision_config = hf_config.vision_config - self.image_token_num = calculate_image_placeholder(vision_config) - self.image_size = vision_config["image_size"] + if not (vision_config := getattr(hf_config, "vision_config", None)): + return 0 - def get_num_image_tokens(self) -> int: - return self.image_token_num + 2 + image_size = vision_config["image_size"] + patch_size = vision_config["patch_size"] + grid_length = image_size // patch_size // 2 + return grid_length * grid_length - def get_image_size(self) -> ImageSize: - - return ImageSize(height=self.image_size, width=self.image_size) - - def get_hf_processor(self) -> GLM4VProcessor: - return GLM4VProcessor( - self.get_hf_config(), - 
self.get_tokenizer(), - ) + def get_num_image_feature_tokens(self) -> int: + # EVA2CLIPModel has embeddings for boi and eoi tokens as well + return self.get_num_image_tokens() + 2 class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): @@ -192,8 +171,12 @@ def get_dummy_processor_inputs( seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + if not (vision_config := getattr(hf_config, "vision_config", None)): + return ProcessorInputs(prompt_text="", mm_data={}) + + target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) - target_width, target_height = self.info.get_image_size() mm_data = { "image": @@ -201,9 +184,11 @@ def get_dummy_processor_inputs( height=target_height, num_images=num_images) } - text = "<|begin_of_image|><|endoftext|><|end_of_image|>" + + base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>" + return ProcessorInputs( - prompt_text=text, + prompt_text=base_text * num_images, mm_data=mm_data, ) @@ -223,47 +208,28 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + if not hasattr(hf_config, "vision_config"): + return [] + + boi_token_id = hf_config.boi_token_id + image_token_id = hf_config.pad_token_id + eoi_token_id = hf_config.eoi_token_id def get_replacement(item_idx: int): - image_tokens = self.info.image_token_num - return [IMAGE_TOKEN_ID] * image_tokens + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens + + return [boi_token_id] + image_tokens + [eoi_token_id] return [ PromptReplacement( modality="image", - target=[IMAGE_TOKEN_ID], + target=[boi_token_id, image_token_id, eoi_token_id], replacement=get_replacement, ), ] - def _apply_prompt_replacements( - self, - token_ids: list[int], - mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], - mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: - token_ids, text, placeholders = super()._apply_prompt_replacements( - token_ids=token_ids, - mm_prompt_repls=mm_prompt_repls, - mm_item_counts=mm_item_counts, - ) - hf_config = self.info.get_hf_config() - boi_token_id = hf_config.boi_token_id - eoi_token_id = hf_config.eoi_token_id - placeholders = { - modality: [ - PlaceholderFeaturesInfo( - modality=p.modality, - item_idx=p.item_idx, - start_idx=p.start_idx - 1, - tokens=[boi_token_id] + p.tokens + [eoi_token_id], - ) for p in ps - ] - for modality, ps in placeholders.items() - } - - return token_ids, text, placeholders - class GLMAttention(nn.Module): @@ -618,7 +584,7 @@ def get_input_embeddings( multimodal_embeddings=multimodal_embeddings, placeholder_token_id=[ self.config.boi_token_id, - IMAGE_TOKEN_ID, + self.config.pad_token_id, self.config.eoi_token_id, ], ) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 89706612431..4b8aeaddbdd 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -63,18 +63,6 @@ logger = init_logger(__name__) -# NOTE: Qwen models have a few other special tags, e.g., ref, bbox, quad; -# for the time being, these tags are not considered as special at encoding -# time. This may change as VLLMs multimodal API changes in the future. 
-IMG_START = "" -IMG_END = "" -IMG_PAD = "" -# Image context is fixed at 256 for all images -MAX_QWEN_IMG_TOKENS = 256 -# Image normalization params -CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073) -CLIP_STD = (0.26862954, 0.26130258, 0.27577711) - class QwenImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -622,25 +610,6 @@ def forward( return hidden_states -def build_normalization_transform(image_size: int) -> transforms.Compose: - """ - Build a normalization transform which can be applied to one or - more input images from which we want to extract visual features. - - Args: - image_size: size of the image to be processed for visual embeddings. - - Returns: - Callable transform for normalizing and resizing one RGB image. - """ - return transforms.Compose([ - transforms.Resize((image_size, image_size), - interpolation=InterpolationMode.BICUBIC), - transforms.ToTensor(), - transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD), - ]) - - @lru_cache(maxsize=1) def _get_tokenizer_without_image_pad( tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: @@ -716,16 +685,34 @@ def __init__( self.config = config self.tokenizer = tokenizer - if hasattr(self.config, "visual"): - self.image_transform = build_normalization_transform( - config.visual["image_size"]) + if vision_config := getattr(self.config, "visual", None): + image_size = vision_config["image_size"] + + self.image_transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ), + ]) else: self.image_transform = None - special_tokens: dict[str, - int] = tokenizer.special_tokens # type: ignore - self.img_start_id = special_tokens[IMG_START] - self.img_end_id = special_tokens[IMG_END] + @property + def image_start_tag(self) -> str: + return self.tokenizer.image_start_tag # type: ignore + + @property + def image_end_tag(self) -> str: + return self.tokenizer.image_end_tag # type: ignore + + @property + def image_pad_tag(self) -> str: + return self.tokenizer.image_pad_tag # type: ignore def __call__( self, @@ -787,7 +774,14 @@ def get_mm_max_tokens_per_item( return {"image": self.get_num_image_tokens()} def get_num_image_tokens(self) -> int: - return MAX_QWEN_IMG_TOKENS + hf_config = self.get_hf_config() + if not (vision_config := getattr(hf_config, "visual", None)): + return 0 + + image_size = vision_config["image_size"] + patch_size = vision_config["patch_size"] + grid_length = image_size // patch_size // 2 + return grid_length * grid_length class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): @@ -798,10 +792,12 @@ def get_dummy_processor_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: hf_config = self.info.get_hf_config() - if not hasattr(hf_config, "visual"): + if not (vision_config := getattr(hf_config, "visual", None)): return ProcessorInputs(prompt_text="", mm_data={}) - vision_config = hf_config.visual + processor = self.info.get_hf_processor() + img_start = processor.image_start_tag + img_end = processor.image_end_tag target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) @@ -814,7 +810,7 @@ def get_dummy_processor_inputs( } return ProcessorInputs( - prompt_text="".join(f"Picture {i}: {IMG_START}{IMG_END}\n" + prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n" for i in range(1, num_images + 1)), mm_data=mm_data, ) @@ -869,13 +865,18 
@@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + hf_config = self.info.get_hf_config() + if not hasattr(hf_config, "visual"): + return [] + tokenizer = self.info.get_tokenizer() special_tokens: dict[str, int] = tokenizer.special_tokens # type: ignore - img_start_id = special_tokens[IMG_START] - img_end_id = special_tokens[IMG_END] - img_pad_id = special_tokens[IMG_PAD] + processor = self.info.get_hf_processor() + img_start_id = special_tokens[processor.image_start_tag] + img_end_id = special_tokens[processor.image_end_tag] + img_pad_id = special_tokens[processor.image_pad_tag] num_image_tokens = self.info.get_num_image_tokens() image_tokens = [img_pad_id] * num_image_tokens diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 34ae7b8c946..f2071eaff48 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -885,14 +885,10 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int: max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - num_frames = min(max(max_total_frames // max(max_videos, 1), 1), - _MAX_FRAMES_PER_VIDEO) + max_frames_per_video = min(max_total_frames // max(max_videos, 1), + _MAX_FRAMES_PER_VIDEO) - # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 - if num_frames > 1 and num_frames % 2 == 1: - num_frames += 1 - - return num_frames + return max(max_frames_per_video, 1) def get_max_video_tokens(self, seq_len: int) -> int: target_width, target_height = self.get_image_size_with_most_features() From c063ff9b01158f1126f1a53ef87f0e9dd7a2968e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=AE=AE=E0=AE=A9=E0=AF=8B=E0=AE=9C=E0=AF=8D=E0=AE=95?= =?UTF-8?q?=E0=AF=81=E0=AE=AE=E0=AE=BE=E0=AE=B0=E0=AF=8D=20=E0=AE=AA?= =?UTF-8?q?=E0=AE=B4=E0=AE=A9=E0=AE=BF=E0=AE=9A=E0=AF=8D=E0=AE=9A=E0=AE=BE?= =?UTF-8?q?=E0=AE=AE=E0=AE=BF?= Date: Mon, 10 Feb 2025 20:56:50 +0530 Subject: [PATCH 0089/1240] Fix seed parameter behavior in vLLM (#13007) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: மனோஜ்குமார் பழனிச்சாமி Signed-off-by: Louis Ulmer --- docs/seed_parameter_behavior.md | 51 +++++++++++++++++++++++++++++++++ tests/test_seed_behavior.py | 39 +++++++++++++++++++++++++ vllm/platforms/interface.py | 9 +++--- 3 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 docs/seed_parameter_behavior.md create mode 100644 tests/test_seed_behavior.py diff --git a/docs/seed_parameter_behavior.md b/docs/seed_parameter_behavior.md new file mode 100644 index 00000000000..ff17525cf8e --- /dev/null +++ b/docs/seed_parameter_behavior.md @@ -0,0 +1,51 @@ +# Seed Parameter Behavior in vLLM + +## Overview + +The `seed` parameter in vLLM is used to control the random states for various random number generators. This parameter can affect the behavior of random operations in user code, especially when working with models in vLLM. + +## Default Behavior + +By default, the `seed` parameter is set to `None`. When the `seed` parameter is `None`, the global random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that the random operations will behave as expected, without any fixed random states. 
+ +## Specifying a Seed + +If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly. This can be useful for reproducibility, as it ensures that the random operations produce the same results across multiple runs. + +## Example Usage + +### Without Specifying a Seed + +```python +import random +from vllm import LLM + +# Initialize a vLLM model without specifying a seed +model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct") + +# Try generating random numbers +print(random.randint(0, 100)) # Outputs different numbers across runs +``` + +### Specifying a Seed + +```python +import random +from vllm import LLM + +# Initialize a vLLM model with a specific seed +model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", seed=42) + +# Try generating random numbers +print(random.randint(0, 100)) # Outputs the same number across runs +``` + +## Important Notes + +- If the `seed` parameter is not specified, the behavior of global random states remains unaffected. +- If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set to that value. +- This behavior can be useful for reproducibility but may lead to non-intuitive behavior if the user is not explicitly aware of it. + +## Conclusion + +Understanding the behavior of the `seed` parameter in vLLM is crucial for ensuring the expected behavior of random operations in your code. By default, the `seed` parameter is set to `None`, which means that the global random states are not affected. However, specifying a seed value can help achieve reproducibility in your experiments. diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py new file mode 100644 index 00000000000..7e4e71563e7 --- /dev/null +++ b/tests/test_seed_behavior.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +import random + +import numpy as np +import torch + +from vllm.platforms.interface import Platform + + +def test_seed_behavior(): + # Test with seed=None + Platform.seed_everything(None) + random_value_1 = random.randint(0, 100) + np_random_value_1 = np.random.randint(0, 100) + torch_random_value_1 = torch.randint(0, 100, (1, )).item() + + Platform.seed_everything(None) + random_value_2 = random.randint(0, 100) + np_random_value_2 = np.random.randint(0, 100) + torch_random_value_2 = torch.randint(0, 100, (1, )).item() + + assert random_value_1 != random_value_2 + assert np_random_value_1 != np_random_value_2 + assert torch_random_value_1 != torch_random_value_2 + + # Test with a specific seed + Platform.seed_everything(42) + random_value_3 = random.randint(0, 100) + np_random_value_3 = np.random.randint(0, 100) + torch_random_value_3 = torch.randint(0, 100, (1, )).item() + + Platform.seed_everything(42) + random_value_4 = random.randint(0, 100) + np_random_value_4 = np.random.randint(0, 100) + torch_random_value_4 = torch.randint(0, 100, (1, )).item() + + assert random_value_3 == random_value_4 + assert np_random_value_3 == np_random_value_4 + assert torch_random_value_3 == torch_random_value_4 diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 211e288b125..645d98a1bb4 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -211,16 +211,17 @@ def inference_mode(cls): return torch.inference_mode(mode=True) @classmethod - def seed_everything(cls, seed: int) -> None: + def seed_everything(cls, seed: Optional[int] = None) -> None: """ Set the seed of each random module. 
`torch.manual_seed` will set seed on all devices. Loosely based on: https://github.com/Lightning-AI/pytorch-lightning/blob/2.4.0/src/lightning/fabric/utilities/seed.py#L20 """ - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) + if seed is not None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: From f4165ae0924a0693c491f135ae5e9089fd87804e Mon Sep 17 00:00:00 2001 From: Farzad Abdolhosseini Date: Mon, 10 Feb 2025 14:02:48 -0800 Subject: [PATCH 0090/1240] [Model] Ultravox Model: Support v0.5 Release (#12912) Signed-off-by: Farzad Abdolhosseini Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 2 +- docs/source/serving/multimodal_inputs.md | 4 +-- examples/offline_inference/audio_language.py | 4 +-- ...i_chat_completion_client_for_multimodal.py | 2 +- tests/distributed/test_pipeline_parallel.py | 4 +-- tests/entrypoints/openai/test_audio.py | 2 +- tests/entrypoints/test_chat_utils.py | 2 +- .../audio_language/test_ultravox.py | 2 +- .../multimodal/processing/test_common.py | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/models/ultravox.py | 26 ++++++++++++------- vllm/transformers_utils/configs/ultravox.py | 6 +++++ 12 files changed, 36 insertions(+), 22 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 91e6c42d526..55b3f52356c 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -856,7 +856,7 @@ See [this page](#generative-models) for more information on how to use generativ - * `UltravoxModel` * Ultravox * T + AE+ - * `fixie-ai/ultravox-v0_3` + * `fixie-ai/ultravox-v0_5-llama-3_2-1b` * ✅︎ * ✅︎ * ✅︎ diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 217b531e837..ade59e37738 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -359,12 +359,12 @@ export VLLM_VIDEO_FETCH_TIMEOUT= ### Audio Audio input is supported according to [OpenAI Audio API](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in). -Here is a simple example using Ultravox-v0.3. +Here is a simple example using Ultravox-v0.5-1B. First, launch the OpenAI-compatible server: ```bash -vllm serve fixie-ai/ultravox-v0_3 +vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b ``` Then, you can use the OpenAI client as follows: diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 707ca9f8789..3e3034a02f0 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -24,9 +24,9 @@ # Unless specified, these settings have been tested to work on a single L4. 
-# Ultravox 0.3 +# Ultravox 0.5-1B def run_ultravox(question: str, audio_count: int): - model_name = "fixie-ai/ultravox-v0_3" + model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b" tokenizer = AutoTokenizer.from_pretrained(model_name) messages = [{ diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index d5f798a8dae..ecfcf05a90d 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -12,7 +12,7 @@ --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 (audio inference with Ultravox) -vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096 +vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096 """ import base64 diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5b6741d74ef..5d7cb9e4089 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -215,7 +215,7 @@ def iter_params(self, model_name: str): "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), - "fixie-ai/ultravox-v0_3": PPTestSettings.fast(trust_remote_code=True), + "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 # [Encoder-decoder] # TODO: Implement PP # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), @@ -234,7 +234,7 @@ def iter_params(self, model_name: str): # [MULTIMODAL GENERATION] "OpenGVLab/InternVL2-1B", "microsoft/Phi-3-vision-128k-instruct", - "fixie-ai/ultravox-v0_3", + "fixie-ai/ultravox-v0_5-llama-3_2-1b", # [LANGUAGE GENERATION - HYBRID ARCH] "ai21labs/Jamba-tiny-dev", ] diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 3459f24834d..fe7299a48e6 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -11,7 +11,7 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "fixie-ai/ultravox-v0_3" +MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" TEST_AUDIO_URLS = [ AudioAsset("winning_call").url, ] diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5c469007af2..c52fa905c80 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -21,7 +21,7 @@ EXAMPLES_DIR = VLLM_PATH / "examples" PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" -ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_3" +ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_5-llama-3_2-1b" QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index fe9361d1261..d1f643a8fdb 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -15,7 +15,7 @@ from ....utils import RemoteOpenAIServer from ...utils import check_logprobs_close -MODEL_NAME = "fixie-ai/ultravox-v0_3" +MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" AudioTuple = Tuple[np.ndarray, int] diff --git a/tests/models/multimodal/processing/test_common.py 
b/tests/models/multimodal/processing/test_common.py index a56a9e2beef..6244056c747 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -164,7 +164,7 @@ def _test_processing_correctness( "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", - "fixie-ai/ultravox-v0_3", + "fixie-ai/ultravox-v0_5-llama-3_2-1b", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 3fd94b89c8a..66b7d3c2e77 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -267,7 +267,7 @@ def check_available_online( "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 min_transformers_version="4.49"), # noqa: E501 - "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", trust_remote_code=True), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 9da0682cfa8..063997a14a6 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -258,27 +258,35 @@ def __init__(self, config: UltravoxConfig): super().__init__() self.hidden_dim = config.hidden_size self._pad_and_stack = StackAudioFrames(config.stack_factor) - dim = config.audio_config.hidden_size * config.stack_factor - self.ln_pre = RMSNorm(dim) - self.linear_1 = nn.Linear(dim, self.hidden_dim, bias=False) - dim = self.hidden_dim + dim_in = config.audio_config.hidden_size * config.stack_factor + self.ln_pre = RMSNorm(dim_in) + self.linear_1 = nn.Linear(dim_in, self.hidden_dim, bias=False) + dim_mid = self.hidden_dim if config.projector_act == "swiglu": self.act = MulAndSilu() - dim = dim // 2 + dim_mid = dim_mid // 2 else: self.act = get_act_fn(config.projector_act) - self.linear_2 = nn.Linear(dim, - config.text_config.hidden_size, - bias=False) - self.ln_post = RMSNorm(config.text_config.hidden_size) + dim_out = config.text_config.hidden_size + self.linear_2 = nn.Linear(dim_mid, dim_out, bias=False) + + # Ultravox v0.4.1 and below use layer_norm after the second linear layer + # while v0.5.0 and above uses layer_norm after the first linear layer. + if config.projector_ln_mid: + self.ln_mid: nn.Module = RMSNorm(dim_mid) + self.ln_post = nn.Identity() + else: + self.ln_mid = nn.Identity() + self.ln_post = RMSNorm(dim_out) def forward(self, audio_features: torch.Tensor) -> torch.Tensor: audio_features = self._pad_and_stack(audio_features) audio_features = self.ln_pre(audio_features) hidden_states = self.linear_1(audio_features) hidden_states = self.act(hidden_states) + hidden_states = self.ln_mid(hidden_states) hidden_states = self.linear_2(hidden_states) hidden_states = self.ln_post(hidden_states) return hidden_states diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 99715ba6d0b..6b2765db94e 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -37,6 +37,10 @@ class UltravoxConfig(transformers.PretrainedConfig): The LoRA configuration for finetuning the text model. 
audio_model_lora_config (`LoraConfigSimplified`, *optional*): The LoRA configuration for finetuning the audio model. + projector_ln_mid (`bool`, *optional*, defaults to `False`): + Whether to apply layer normalization at the middle of the + projector or at the end. Versions v0.4.1 and below + use `False`, but v0.5 and above use `True`. """ model_type = "ultravox" @@ -56,6 +60,7 @@ def __init__( projector_act: str = "swiglu", text_model_lora_config: Optional[Dict[str, Any]] = None, audio_model_lora_config: Optional[Dict[str, Any]] = None, + projector_ln_mid: bool = False, **kwargs, ): self.ignore_index = ignore_index @@ -68,6 +73,7 @@ def __init__( self.stack_factor = stack_factor self.norm_init = norm_init self.projector_act = projector_act + self.projector_ln_mid = projector_ln_mid if text_model_id is not None: # Avoid circular import From 1ad4f7b80a9df2e22b1a96837943d4e3a8bce9d5 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 10 Feb 2025 18:06:16 -0800 Subject: [PATCH 0091/1240] [misc] Fix setup.py condition to avoid AMD from being mistaken with CPU (#13022) Signed-off-by: kevin Signed-off-by: Louis Ulmer --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3e2adadf670..27e5aab760f 100755 --- a/setup.py +++ b/setup.py @@ -48,8 +48,9 @@ def load_module_from_path(module_name, path): "so vLLM may not be able to run correctly", sys.platform) VLLM_TARGET_DEVICE = "empty" elif (sys.platform.startswith("linux") and torch.version.cuda is None - and os.getenv("VLLM_TARGET_DEVICE") is None): - # if cuda is not available and VLLM_TARGET_DEVICE is not set, + and os.getenv("VLLM_TARGET_DEVICE") is None + and torch.version.hip is None): + # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set, # fallback to cpu VLLM_TARGET_DEVICE = "cpu" From 110e41ee6bb61f116fec7510fc6a1630a5176d82 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 10 Feb 2025 18:10:06 -0800 Subject: [PATCH 0092/1240] [V1][Minor] Move scheduler outputs to a separate file (#13062) Signed-off-by: Woosuk Kwon Signed-off-by: Louis Ulmer --- vllm/v1/core/scheduler.py | 89 +----------------------- vllm/v1/core/scheduler_output.py | 108 +++++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/v1/worker/gpu_worker.py | 3 +- 4 files changed, 113 insertions(+), 89 deletions(-) create mode 100644 vllm/v1/core/scheduler_output.py diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 1aa34ee3860..1c54914d182 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,26 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 from collections import deque -from dataclasses import dataclass -from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, - Tuple, Union) +from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingParams from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager, compute_encoder_budget) from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData, + SchedulerOutput) from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus -if 
TYPE_CHECKING: - from vllm.multimodal import MultiModalKwargs - from vllm.multimodal.base import PlaceholderRange - logger = init_logger(__name__) @@ -600,80 +594,3 @@ def make_stats(self) -> SchedulerStats: num_waiting_reqs=len(self.waiting), gpu_cache_usage=self.kv_cache_manager.usage, ) - - -@dataclass -class NewRequestData: - - req_id: str - prompt_token_ids: List[int] - prompt: Optional[str] - mm_inputs: List["MultiModalKwargs"] - mm_hashes: List[str] - mm_positions: List["PlaceholderRange"] - sampling_params: SamplingParams - block_ids: List[int] - num_computed_tokens: int - lora_request: Optional[LoRARequest] - - @classmethod - def from_request( - cls, - request: Request, - block_ids: List[int], - num_computed_tokens: int, - ) -> "NewRequestData": - return cls( - req_id=request.request_id, - prompt_token_ids=request.prompt_token_ids, - prompt=request.prompt, - mm_inputs=request.mm_inputs, - mm_hashes=request.mm_hashes, - mm_positions=request.mm_positions, - sampling_params=request.sampling_params, - block_ids=block_ids, - num_computed_tokens=num_computed_tokens, - lora_request=request.lora_request, - ) - - -@dataclass -class CachedRequestData: - - req_id: str - # If resumed_from_preemption is False, new_block_ids will be appended to - # the request's block IDs. If True, new_block_ids will be used as the - # request's block IDs instead of appending to the existing block IDs. - resumed_from_preemption: bool - new_block_ids: List[int] - num_computed_tokens: int - - @classmethod - def from_request( - cls, - request: Request, - resumed_from_preemption: bool, - new_block_ids: List[int], - num_computed_tokens: int, - ) -> "CachedRequestData": - return cls( - req_id=request.request_id, - resumed_from_preemption=resumed_from_preemption, - new_block_ids=new_block_ids, - num_computed_tokens=num_computed_tokens, - ) - - -@dataclass -class SchedulerOutput: - - scheduled_new_reqs: List[NewRequestData] - scheduled_cached_reqs: List[CachedRequestData] - - num_scheduled_tokens: Dict[str, int] - total_num_scheduled_tokens: int - scheduled_encoder_inputs: Dict[str, List[int]] - num_common_prefix_blocks: int - - finished_req_ids: Set[str] - free_encoder_input_ids: List[Tuple[str, int]] diff --git a/vllm/v1/core/scheduler_output.py b/vllm/v1/core/scheduler_output.py new file mode 100644 index 00000000000..990b3dd0ed7 --- /dev/null +++ b/vllm/v1/core/scheduler_output.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple + +if TYPE_CHECKING: + from vllm.lora.request import LoRARequest + from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.base import PlaceholderRange + from vllm.sampling_params import SamplingParams + from vllm.v1.request import Request + + +@dataclass +class NewRequestData: + + req_id: str + prompt_token_ids: List[int] + prompt: Optional[str] + mm_inputs: List["MultiModalKwargs"] + mm_hashes: List[str] + mm_positions: List["PlaceholderRange"] + sampling_params: "SamplingParams" + block_ids: List[int] + num_computed_tokens: int + lora_request: Optional["LoRARequest"] + + @classmethod + def from_request( + cls, + request: "Request", + block_ids: List[int], + num_computed_tokens: int, + ) -> "NewRequestData": + return cls( + req_id=request.request_id, + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + mm_inputs=request.mm_inputs, + mm_hashes=request.mm_hashes, + mm_positions=request.mm_positions, + sampling_params=request.sampling_params, + 
block_ids=block_ids, + num_computed_tokens=num_computed_tokens, + lora_request=request.lora_request, + ) + + +@dataclass +class CachedRequestData: + + req_id: str + # If resumed_from_preemption is False, new_block_ids will be appended to + # the request's block IDs. If True, new_block_ids will be used as the + # request's block IDs instead of appending to the existing block IDs. + resumed_from_preemption: bool + new_block_ids: List[int] + num_computed_tokens: int + + @classmethod + def from_request( + cls, + request: "Request", + resumed_from_preemption: bool, + new_block_ids: List[int], + num_computed_tokens: int, + ) -> "CachedRequestData": + return cls( + req_id=request.request_id, + resumed_from_preemption=resumed_from_preemption, + new_block_ids=new_block_ids, + num_computed_tokens=num_computed_tokens, + ) + + +@dataclass +class SchedulerOutput: + + # List of the requests that are scheduled for the first time. + # We cache the request's data in each worker process, so that we don't + # need to re-send it every scheduling step. + scheduled_new_reqs: List[NewRequestData] + # List of the requests that have been scheduled before. + # Since the request's data is already cached in the worker processes, + # we only send the diff to minimize the communication cost. + scheduled_cached_reqs: List[CachedRequestData] + + # req_id -> num_scheduled_tokens + # Number of tokens scheduled for each request. + num_scheduled_tokens: Dict[str, int] + # Total number of tokens scheduled for all requests. + # Equal to sum(num_scheduled_tokens.values()) + total_num_scheduled_tokens: int + # req_id -> encoder input indices that need processing. + # E.g., if a request has [0, 1], it could mean the vision encoder needs + # to process that the request's 0-th and 1-th images in the current step. + scheduled_encoder_inputs: Dict[str, List[int]] + # Number of common prefix blocks for all requests. + # This can be used for cascade attention. + num_common_prefix_blocks: int + + # Request IDs that are finished in between the previous and the current + # steps. This is used to notify the workers about the finished requests + # so that they can free the cached states for those requests. + finished_req_ids: Set[str] + # List of (req_id, encoder_input_index) tuples. + # Used to free the encoder cache. 
+    free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index fdbca70bda7..9b1eab613bf 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -36,7 +36,7 @@ from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin

 if TYPE_CHECKING:
-    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.core.scheduler_output import SchedulerOutput

 logger = init_logger(__name__)

diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 0adb6907339..ad53f90b866 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -18,7 +18,6 @@ from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes
-from vllm.v1.core.scheduler import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -26,7 +25,7 @@ logger = init_logger(__name__)

 if TYPE_CHECKING:
-    from vllm.v1.core.scheduler import SchedulerOutput
+    from vllm.v1.core.scheduler_output import SchedulerOutput


 class Worker:

From 110d57748a66d7a8aabbc77f5871a4f0f5879aa8 Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Mon, 10 Feb 2025 18:24:29 -0800
Subject: [PATCH 0093/1240] [Docs] Announce Meta Meetup (#13065)

Signed-off-by: simon-mo
Signed-off-by: Louis Ulmer
---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index f04acf09cff..f22a1f9c5c8 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,10 @@ Easy, fast, and cheap LLM serving for everyone

 ---

+We are excited to invite you to our Menlo Park meetup with Meta, evening of Thursday, February 27! Meta engineers will discuss the improvements on top of vLLM, and vLLM contributors will share updates from the v0.7.x series of releases. [Register Now](https://lu.ma/h7g3kuj9)
+
+---
+
 *Latest News* 🔥

 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
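A note on the import change in the scheduler-output split above (PATCH 0092/1240): the worker modules only need `SchedulerOutput` for type annotations, so importing it under `typing.TYPE_CHECKING` keeps the scheduler package out of the workers' runtime import graph and avoids a circular import. The sketch below is a minimal, self-contained illustration of that pattern; the module and class names are placeholders, not vLLM's actual files.

```python
# worker.py -- hypothetical stand-in for gpu_worker.py / gpu_model_runner.py
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers (mypy, pyright); never executed
    # at runtime, so it cannot pull the scheduler side into this module's
    # import graph or create an import cycle.
    from scheduler_output import SchedulerOutput


class Worker:
    def execute_model(self, scheduler_output: "SchedulerOutput") -> None:
        # The quoted annotation is resolved lazily, so the name does not need
        # to be importable when this module is loaded.
        print(f"executing step with {scheduler_output!r}")
```

At runtime the only trace of the dependency is the string annotation; type checkers still resolve it to the real dataclass in the scheduler-output module.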
From 3d7e275c12a2cc387a2cb707e49ef273c6cddfbc Mon Sep 17 00:00:00 2001 From: Florian Greinacher Date: Tue, 11 Feb 2025 04:33:33 +0100 Subject: [PATCH 0094/1240] [Bugfix] Support missing tool parameters in mistral tokenizer (#12884) Signed-off-by: Florian Greinacher Signed-off-by: Louis Ulmer --- tests/tokenization/test_mistral_tokenizer.py | 50 ++++++++++++++++ vllm/transformers_utils/tokenizers/mistral.py | 57 ++++++++++++------- 2 files changed, 88 insertions(+), 19 deletions(-) create mode 100644 tests/tokenization/test_mistral_tokenizer.py diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py new file mode 100644 index 00000000000..03e1f1fadd7 --- /dev/null +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +from mistral_common.protocol.instruct.messages import UserMessage +from mistral_common.protocol.instruct.request import ChatCompletionRequest +from mistral_common.protocol.instruct.tool_calls import Function, Tool + +from vllm.transformers_utils.tokenizers.mistral import ( + make_mistral_chat_completion_request) + + +# yapf: enable +@pytest.mark.parametrize( + "openai_request,expected_mistral_request", + [( + { + "messages": [{ + "role": "user", + "content": "What is the current local date and time?", + }], + "tools": [{ + "type": "function", + "function": { + "description": "Fetch the current local date and time.", + "name": "get_current_time", + }, + }], + }, + ChatCompletionRequest( + messages=[ + UserMessage(content="What is the current local date and time?") + ], + tools=[ + Tool( + type="function", + function=Function( + name="get_current_time", + description="Fetch the current local date and time.", + parameters={}, + ), + ) + ], + ), + )], +) +def test_make_mistral_chat_completion_request(openai_request, + expected_mistral_request): + assert (make_mistral_chat_completion_request( + openai_request["messages"], + openai_request["tools"]) == expected_mistral_request) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 8d96fcd278e..f08923e7401 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -104,6 +104,42 @@ def find_tokenizer_file(files: List[str]): return matched_files[0] +def make_mistral_chat_completion_request( + messages: List["ChatCompletionMessageParam"], + tools: Optional[List[Dict[str, + Any]]] = None) -> "ChatCompletionRequest": + last_message = cast(Dict[str, Any], messages[-1]) + if last_message["role"] == "assistant": + last_message["prefix"] = True + + last_message = cast(Dict[str, Any], messages[-1]) + if last_message["role"] == "assistant": + last_message["prefix"] = True + + # mistral-common requires AssistantMessage content to be string [1]. + # + # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80 + for message in messages: + if message.get("role") == "assistant": + content = message.get("content") + if isinstance(content, list): + content = "\n".join(chunk.get("text") for chunk in content) + message["content"] = content + + # The Mistral client, in comparison to the OpenAI client, requires the + # "parameters" dict to be present, even if it's empty. 
+ if tools: + for function in [ + tool["function"] for tool in tools + if tool["type"] == "function" + ]: + function.setdefault("parameters", {}) + + from mistral_common.protocol.instruct.request import ChatCompletionRequest + return ChatCompletionRequest(messages=messages, + tools=tools) # type: ignore[type-var] + + class MistralTokenizer: def __init__(self, tokenizer: "PublicMistralTokenizer") -> None: @@ -283,27 +319,10 @@ def encode(self, prompt: str) -> List[int]: def apply_chat_template(self, messages: List["ChatCompletionMessageParam"], - tools: Optional[Dict[str, Any]] = None, + tools: Optional[List[Dict[str, Any]]] = None, **kwargs) -> List[int]: - last_message = cast(Dict[str, Any], messages[-1]) - if last_message["role"] == "assistant": - last_message["prefix"] = True - - from mistral_common.protocol.instruct.request import ( - ChatCompletionRequest) - - # mistral-common requires AssistantMessage content to be string [1]. - # - # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80 - for message in messages: - if message.get("role") == "assistant": - content = message.get("content") - if isinstance(content, list): - content = "\n".join(chunk.get("text") for chunk in content) - message["content"] = content - request = ChatCompletionRequest(messages=messages, - tools=tools) # type: ignore[type-var] + request = make_mistral_chat_completion_request(messages, tools) encoded = self.mistral.encode_chat_completion(request) # encode-decode to get clean prompt From d057777b3c5240d985708c3c06369a07447fa0b0 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 10 Feb 2025 21:25:30 -0800 Subject: [PATCH 0095/1240] [Benchmark] Add BurstGPT to benchmark_serving (#13063) Signed-off-by: Woosuk Kwon Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Signed-off-by: Louis Ulmer --- benchmarks/README.md | 8 +++++++ benchmarks/benchmark_serving.py | 40 ++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 890a2525bcf..367ef93457f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -19,3 +19,11 @@ mkdir coco -p wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip unzip coco/train2017.zip -d coco/ ``` + +# Downloading the BurstGPT dataset + +You can download the BurstGPT v1.1 dataset by running: + +```bash +wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv +``` diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1044bef5941..0c892384236 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -38,6 +38,7 @@ from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple import numpy as np +import pandas as pd from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, RequestFuncOutput) from datasets import load_dataset @@ -131,6 +132,35 @@ def sample_sharegpt_requests( return filtered_dataset +def sample_burstgpt_requests( + dataset_path: str, + num_requests: int, + random_seed: int, + tokenizer: PreTrainedTokenizerBase, +) -> List[Tuple[str, int, int, None]]: + df = pd.read_csv(dataset_path) + gpt4_df = df[df["Model"] == "GPT-4"] + # Remove the failed requests (i.e., response length is 0) + gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0] + # Randomly sample num_requests from the dataset + if num_requests <= len(gpt4_df): + gpt4_df 
= gpt4_df.sample(n=num_requests, random_state=random_seed) + else: + gpt4_df = gpt4_df.sample(n=num_requests, + random_state=random_seed, + replace=True) + # Convert the dataframe to a list of tuples + dataset = gpt4_df.values.tolist() + input_requests = [] + for i in range(num_requests): + input_len = int(dataset[i][2]) + output_len = int(dataset[i][3]) + prompt = tokenizer.decode([(i + j) % tokenizer.vocab_size + for j in range(input_len)]) + input_requests.append((prompt, input_len, output_len, None)) + return input_requests + + def sample_sonnet_requests( dataset_path: str, num_requests: int, @@ -830,6 +860,14 @@ def main(args: argparse.Namespace): fixed_output_len=args.sharegpt_output_len, ) + elif args.dataset_name == "burstgpt": + input_requests = sample_burstgpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + random_seed=args.seed, + tokenizer=tokenizer, + ) + elif args.dataset_name == "sonnet": # Do not format the prompt, pass to message directly if args.backend == "openai-chat": @@ -995,7 +1033,7 @@ def main(args: argparse.Namespace): "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "sonnet", "random", "hf"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], help="Name of the dataset to benchmark on.", ) parser.add_argument("--dataset-path", From a1296c58225268d091234bfd2acd6ffd3cbaf25d Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 11 Feb 2025 02:25:25 -0500 Subject: [PATCH 0096/1240] [Core] Don't do platform detection at import time (#12933) Signed-off-by: Russell Bryant Signed-off-by: Louis Ulmer --- vllm/executor/executor_base.py | 6 +++--- vllm/executor/ray_utils.py | 6 +++--- vllm/platforms/cuda.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index fb76276bb4b..242690f8e1b 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -8,11 +8,11 @@ import torch.nn as nn from typing_extensions import TypeVar +import vllm.platforms from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async @@ -108,8 +108,8 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """ # NOTE: This is logged in the executor because there can be >1 workers. 
logger.info("# %s blocks: %d, # CPU blocks: %d", - current_platform.dispatch_key, num_gpu_blocks, - num_cpu_blocks) + vllm.platforms.current_platform.dispatch_key, + num_gpu_blocks, num_cpu_blocks) max_concurrency = (num_gpu_blocks * self.cache_config.block_size / self.model_config.max_model_len) logger.info("Maximum concurrency for %s tokens per request: %.2fx", diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7b30155971a..33c0a25803c 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -7,10 +7,10 @@ import msgspec +import vllm.platforms from vllm.config import ParallelConfig from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import get_ip from vllm.worker.worker_base import WorkerWrapperBase @@ -54,10 +54,10 @@ def get_node_ip(self) -> str: def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: node_id = ray.get_runtime_context().get_node_id() - device_key = current_platform.ray_device_key + device_key = vllm.platforms.current_platform.ray_device_key if not device_key: raise RuntimeError("current platform %s does not support ray.", - current_platform.device_name) + vllm.platforms.current_platform.device_name) gpu_ids = ray.get_runtime_context().get_accelerator_ids( )[device_key] return node_id, gpu_ids diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 991d55ac861..9deb0294668 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -334,10 +334,10 @@ def log_warnings(cls): if (len(set(device_names)) > 1 and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"): logger.warning( - "Detected different devices in the system: \n%s\nPlease" + "Detected different devices in the system: %s. 
Please" " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " "avoid unexpected behavior.", - "\n".join(device_names), + ", ".join(device_names), ) From 2948f294962a4ab0250e680fb5115d750edbb7a5 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 11 Feb 2025 12:56:03 +0530 Subject: [PATCH 0097/1240] [Misc] LoRA - Refactor Punica ops tests (#12970) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath Signed-off-by: Louis Ulmer --- tests/lora/test_punica_ops.py | 652 ++++++++++++++++++++++++ tests/lora/test_punica_ops_sizes.py | 401 --------------- tests/lora/test_punica_ops_variation.py | 317 ------------ tests/lora/utils.py | 41 +- 4 files changed, 686 insertions(+), 725 deletions(-) create mode 100644 tests/lora/test_punica_ops.py delete mode 100644 tests/lora/test_punica_ops_sizes.py delete mode 100644 tests/lora/test_punica_ops_variation.py diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py new file mode 100644 index 00000000000..032e20470bc --- /dev/null +++ b/tests/lora/test_punica_ops.py @@ -0,0 +1,652 @@ +# SPDX-License-Identifier: Apache-2.0 +from threading import Lock +from typing import List + +import pytest +import torch + +import vllm.lora.ops.triton_ops # noqa: F401 +from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, + bgmv_shrink, sgmv_expand, + sgmv_expand_slice, sgmv_shrink) +from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.platforms import current_platform + +from .utils import (PunicaTensors, assert_close, generate_data, + generate_data_for_expand_nslices, + generate_data_for_nslices) + + +# Utility shrink and expand operations used as reference implementations. +def sgmv_shrink_for_nslices( + nslices: int, inputs_tensor: torch.Tensor, + lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor, + prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int, + num_tokens: int, scaling: float): + """ + Wrapper around sgmv_shrink that handles any nslices. + """ + for index in range(nslices): + sgmv_shrink( + inputs_tensor, + lora_weights_lst[index], + out_tensor[index], + b_seq_start_loc, + seq_len_tensor, + prompt_lora_mapping, + batches, + max_seq_length, + num_tokens, + scaling, + ) + + +def sgmv_expand_for_nslices(nslices: int, hidden_size: int, + inputs_tensor: torch.Tensor, + lora_weights_lst: List[torch.Tensor], + out_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + prompt_lora_mapping: torch.Tensor, batches: int, + max_seq_length: int, num_tokens: int, + add_inputs: bool) -> None: + """ + Wrapper around sgmv_expand that handles any nslices. 
+ """ + if nslices == 1: + # Verify the torch's sgmv_expand op + sgmv_expand( + inputs_tensor[0], + lora_weights_lst[0], + out_tensor, + b_seq_start_loc, + seq_len_tensor, + prompt_lora_mapping, + batches, + max_seq_length, + num_tokens, + add_inputs=add_inputs, + ) + else: + slice_offset = 0 + for index in range(nslices): + lora_weights = lora_weights_lst[index] + sgmv_expand_slice( + inputs_tensor[index], + lora_weights, + out_tensor, + b_seq_start_loc, + seq_len_tensor, + prompt_lora_mapping, + batches, + max_seq_length, + num_tokens, + slice_offset, + hidden_size, + add_inputs=add_inputs, + ) + slice_offset += hidden_size + + +_dict_lock = Lock() + + +def check_sgmv_shrink(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, + device: str, seq_length: int, scaling: float): + """ + Compare outputs of vllm.sgmv_shrink kernel against a reference + implementation. + """ + data: PunicaTensors = generate_data_for_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + nslices, + dtype, + "shrink", + device, + ) + max_seq_length, token_nums = data.meta() + + # Preventing cache error pointer. + with _dict_lock: + _LORA_A_PTR_DICT.clear() + torch.ops.vllm.sgmv_shrink( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + scaling, + ) + + sgmv_shrink_for_nslices( + nslices, + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + scaling, + ) + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_sgmv_expand(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, + device: str, seq_length: int, add_inputs: bool): + """ + Compare outputs of vllm.sgmv_expand kernel against a reference + implementation. + """ + data: PunicaTensors = generate_data_for_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + nslices, + dtype, + "expand", + device, + ) + + max_seq_length, token_nums = data.meta() + + with _dict_lock: + _LORA_B_PTR_DICT.clear() + torch.ops.vllm.sgmv_expand( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + offset_start=0, + add_inputs=add_inputs, + ) + + sgmv_expand_for_nslices(nslices, + hidden_size, + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.b_seq_start_loc, + data.seq_len_tensor, + data.prompt_lora_mapping, + batches, + max_seq_length, + token_nums, + add_inputs=add_inputs) + + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_bgmv_shrink(batches: int, num_loras: int, rank: int, + hidden_size: int, dtype: torch.dtype, device: str, + scaling: float): + """ + Compare vllm.bgmv_shrink against a reference implementation. 
+ """ + seq_length = 1 + data: PunicaTensors = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + "shrink", + device, + ) + + torch.ops.vllm.bgmv_shrink( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.token_lora_mapping, + scaling, + ) + + bgmv_shrink( + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.token_lora_mapping, + scaling, + ) + + data.ref_out_tensor = data.ref_out_tensor.to(torch.float32) + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_bgmv_expand(batches: int, num_loras: int, rank: int, + hidden_size: int, dtype: torch.dtype, device: str, + add_inputs: bool): + """ + Compare vllm.bgmv_expand against a reference implementation. + """ + seq_length = 1 + data: PunicaTensors = generate_data( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + "expand", + device, + ) + + torch.ops.vllm.bgmv_expand( + data.inputs_tensor, + data.lora_weights, + data.our_out_tensor, + data.token_lora_mapping, + add_inputs=add_inputs, + ) + bgmv_expand( + data.inputs_tensor, + data.lora_weights, + data.ref_out_tensor, + data.token_lora_mapping, + add_inputs=add_inputs, + ) + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, dtype: torch.dtype, + device: str, add_inputs: bool): + """ + Compare vllm.bgmv_expand_slice against a reference implementation. + """ + seq_length = 1 + data: PunicaTensors = generate_data_for_expand_nslices( + batches, + hidden_size, + num_loras, + rank, + seq_length, + dtype, + nslices, + device, + ) + + slice_offset = 0 + for index in range(nslices): + torch.ops.vllm.bgmv_expand_slice( + data.inputs_tensor, + data.lora_weights[index], + data.our_out_tensor, + data.token_lora_mapping, + slice_offset, + slice_size=hidden_size, + add_inputs=add_inputs, + ) + bgmv_expand_slice( + data.inputs_tensor, + data.lora_weights[index], + data.ref_out_tensor, + data.token_lora_mapping, + slice_offset, + slice_size=hidden_size, + add_inputs=add_inputs, + ) + + slice_offset += hidden_size + assert_close(data.our_out_tensor, data.ref_out_tensor) + + +# Tests +# We test the punica kernels along 2 verticals mainly. +# 1. Variations in hidden_dim size +# 2. Variations in all other parameters like (batch_size, max_rank, num_loras +# etc.) + +# We have collected the hidden_sizes included in the LoRA models +# currently supported by vLLM. It tests whether the corresponding Triton +# kernel can run normally when tensor parallelism is set to +# [1, 2, 4, 8, 16, 32, 64]. 
+HIDDEN_SIZES = [ + 128, + 256, + 512, + 896, + 1024, + 1152, + 1216, + 1280, + 1536, + 1664, + 2048, + 2240, + 2304, + 2368, + 2432, + 2560, + 2752, + 3072, + 3328, + 3456, + 3584, + 3712, + 4096, + 4480, + 4608, + 4736, + 4864, + 5120, + 5504, + 5632, + 5888, + 6144, + 6400, + 6848, + 6912, + 7168, + 7424, + 8192, + 8960, + 9216, + 9472, + 10240, + 11008, + 11264, + 13824, + 14336, + 14784, + 14848, + 15360, + 18944, + 22016, + 22528, + 24576, + 27392, + 27648, + 29568, + 29696, + 32000, + 32256, + 32512, + 32768, + 33024, + 36864, + 43264, + 49152, + 49408, + 60544, + 60672, + 64000, + 64256, + 102400, + 102656, + 128000, + 128256, +] +#The size of TP +divisibility = [1, 2, 8, 16, 64] + +all_hidden_size = [] +for div in divisibility: + for hidden_size in HIDDEN_SIZES: + all_hidden_size.append(hidden_size // div) + +HIDDEN_SIZES = list(set(all_hidden_size)) + +# Test params that focuses on hidden_size variation. +hs_test_params = { + "hidden_sizes": HIDDEN_SIZES, + "batches": [4], + "num_loras": [4], + "max_ranks": [32], +} + +# General tests params that tests for variations in all dimensions +# except hidden_size. +test_params = { + "hidden_sizes": [2049], + "batches": [1, 4, 16, 32], + "num_loras": [1, 8, 32, 128], + "max_ranks": [1, 4, 8, 16, 32, 64, 128, 256], +} + +DTYPES = [torch.float16, torch.bfloat16] +DEVICES = [f"cuda:{0}"] +SEED = [0] + + +@pytest.mark.parametrize("batches", test_params['batches']) +@pytest.mark.parametrize("num_loras", test_params['num_loras']) +@pytest.mark.parametrize("rank", test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [1, 2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_sgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_sgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + scaling=0.5) + else: + check_sgmv_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + add_inputs=True) + + +@pytest.mark.parametrize("batches", hs_test_params['batches']) +@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) +@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [1, 2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_sgmv_hidden_size( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_sgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + scaling=0.5) + else: + check_sgmv_expand(batches=batches, 
+ num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + seq_length=128, + add_inputs=True) + + +@pytest.mark.parametrize("batches", test_params['batches']) +@pytest.mark.parametrize("num_loras", test_params['num_loras']) +@pytest.mark.parametrize("rank", test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_bgmv( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_bgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + scaling=0.5) + else: + check_bgmv_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + add_inputs=True) + + +@pytest.mark.parametrize("batches", hs_test_params['batches']) +@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) +@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +@pytest.mark.parametrize("op_type", ["shrink", "expand"]) +def test_punica_bgmv_hidden_size( + batches: int, + num_loras: int, + rank: int, + hidden_size: int, + dtype: torch.dtype, + device: str, + seed: int, + op_type: str, +): + torch.set_default_device(device) + current_platform.seed_everything(seed) + + if op_type == "shrink": + check_bgmv_shrink(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + scaling=0.5) + else: + check_bgmv_expand(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + dtype=dtype, + device=device, + add_inputs=True) + + +@pytest.mark.parametrize("batches", test_params['batches']) +@pytest.mark.parametrize("num_loras", test_params['num_loras']) +@pytest.mark.parametrize("rank", test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int, + hidden_size: int, nslices: int, + dtype: torch.dtype, device: str, + seed: int): + + torch.set_default_device(device) + current_platform.seed_everything(seed) + + check_bgmv_expand_slice(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + add_inputs=True) + + +@pytest.mark.parametrize("batches", hs_test_params['batches']) +@pytest.mark.parametrize("num_loras", hs_test_params['num_loras']) +@pytest.mark.parametrize("rank", hs_test_params['max_ranks']) +@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes']) +@pytest.mark.parametrize("nslices", [2, 3]) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("seed", SEED) +def 
test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int, + rank: int, hidden_size: int, + nslices: int, + dtype: torch.dtype, + device: str, seed: int): + + torch.set_default_device(device) + current_platform.seed_everything(seed) + + check_bgmv_expand_slice(batches=batches, + num_loras=num_loras, + rank=rank, + hidden_size=hidden_size, + nslices=nslices, + dtype=dtype, + device=device, + add_inputs=True) diff --git a/tests/lora/test_punica_ops_sizes.py b/tests/lora/test_punica_ops_sizes.py deleted file mode 100644 index ecd3bc4978f..00000000000 --- a/tests/lora/test_punica_ops_sizes.py +++ /dev/null @@ -1,401 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This script is mainly used to tests various hidden_sizes. We have collected the -hidden_sizes included in the LoRA models currently supported by vLLM. It tests -whether the corresponding Triton kernel can run normally when tensor parallelism -is set to [1, 2, 4, 8, 16, 32, 64]. -""" -from threading import Lock - -import pytest -import torch - -import vllm.lora.ops.triton_ops # noqa: F401 -from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, - bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) -from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT -from vllm.platforms import current_platform - -from .utils import (assert_close, generate_data, - generate_data_for_expand_nslices, - generate_data_for_nslices) - -HIDDEN_SIZES = [ - 128, - 256, - 512, - 896, - 1024, - 1152, - 1216, - 1280, - 1536, - 1664, - 2048, - 2240, - 2304, - 2368, - 2432, - 2560, - 2752, - 3072, - 3328, - 3456, - 3584, - 3712, - 4096, - 4480, - 4608, - 4736, - 4864, - 5120, - 5504, - 5632, - 5888, - 6144, - 6400, - 6848, - 6912, - 7168, - 7424, - 8192, - 8960, - 9216, - 9472, - 10240, - 11008, - 11264, - 13824, - 14336, - 14784, - 14848, - 15360, - 18944, - 22016, - 22528, - 24576, - 27392, - 27648, - 29568, - 29696, - 32000, - 32256, - 32512, - 32768, - 33024, - 36864, - 43264, - 49152, - 49408, - 60544, - 60672, - 64000, - 64256, - 102400, - 102656, - 128000, - 128256, -] -#The size of TP -divisibility = [1, 2, 8, 16, 64] - -all_hidden_size = [] -for div in divisibility: - for hidden_size in HIDDEN_SIZES: - all_hidden_size.append(hidden_size // div) - -HIDDEN_SIZES = list(set(all_hidden_size)) - -BATCHES = [4] -NUM_LORA = [4] -DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [32] -SCALES = [0.5] -SEED = [0] -DEVICES = [f"cuda:{0}"] - -_dict_lock = Lock() - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("nslices", [1, 2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_sgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - nslices: int, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 128 - ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - nslices, - dtype, - op_type, - device, - ) - 
max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = max_seq_length.item() - if op_type == "shrink": - # Preventing cache error pointer. - with _dict_lock: - _LORA_A_PTR_DICT.clear() - torch.ops.vllm.sgmv_shrink( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - for index in range(nslices): - sgmv_shrink( - inputs_tensor, - lora_weights_lst[index], - ref_out_tensor[index], - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - - else: - with _dict_lock: - _LORA_B_PTR_DICT.clear() - torch.ops.vllm.sgmv_expand( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - offset_start=0, - add_inputs=True, - ) - if nslices == 1: - # Verify the torch's sgmv_expand op - sgmv_expand( - inputs_tensor[0], - lora_weights_lst[0], - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - else: - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - sgmv_expand_slice( - inputs_tensor[index], - lora_weights, - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - slice_offset += hidden_size - - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - op_type, - device, - ) - if op_type == "shrink": - torch.ops.vllm.bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) - - bgmv_shrink( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - scaling, - ) - - else: - torch.ops.vllm.bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) - bgmv_expand( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - add_inputs=True, - ) - - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", [2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) 
-@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv_expand_nslices( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - nslices: int, - dtype: torch.dtype, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights_lst, - our_outputs, - ref_outputs, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_expand_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - nslices, - device, - ) - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - torch.ops.vllm.bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - bgmv_expand_slice( - inputs_tensor, - lora_weights, - ref_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - - slice_offset += hidden_size - assert_close(our_outputs, ref_outputs) diff --git a/tests/lora/test_punica_ops_variation.py b/tests/lora/test_punica_ops_variation.py deleted file mode 100644 index 6d1d3c9430f..00000000000 --- a/tests/lora/test_punica_ops_variation.py +++ /dev/null @@ -1,317 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -""" -This script is mainly used to test whether trtion kernels can run normally -under different conditions, including various batches, numbers of LoRA , and -maximum ranks. -""" -from threading import Lock - -import pytest -import torch - -# Enable custom op register -import vllm.lora.ops.triton_ops # noqa: F401 -from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice, - bgmv_shrink, sgmv_expand, - sgmv_expand_slice, sgmv_shrink) -from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT -from vllm.platforms import current_platform - -from .utils import (assert_close, generate_data, - generate_data_for_expand_nslices, - generate_data_for_nslices) - -HIDDEN_SIZES = [2049] - -BATCHES = [1, 4, 16, 32] -NUM_LORA = [1, 8, 32, 128] -DTYPES = [torch.float16, torch.bfloat16] -MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256] -SCALES = [0.5] -SEED = [0] -DEVICES = [f"cuda:{0}"] - -_dict_lock = Lock() - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("nslices", [1, 2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_sgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - nslices: int, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 128 - ( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - nslices, - dtype, - op_type, - device, - ) - max_seq_length = seq_len_tensor.max() - token_nums = seq_len_tensor.sum().item() - if isinstance(max_seq_length, tuple): - max_seq_length = max_seq_length[0].item() - else: - max_seq_length = 
max_seq_length.item() - if op_type == "shrink": - # Preventing cache error pointer. - with _dict_lock: - _LORA_A_PTR_DICT.clear() - torch.ops.vllm.sgmv_shrink( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - for index in range(nslices): - sgmv_shrink( - inputs_tensor, - lora_weights_lst[index], - ref_out_tensor[index], - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - scaling, - ) - - else: - with _dict_lock: - _LORA_B_PTR_DICT.clear() - torch.ops.vllm.sgmv_expand( - inputs_tensor, - lora_weights_lst, - our_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - offset_start=0, - add_inputs=True, - ) - slice_offset = 0 - if nslices == 1: - # Verify the torch's sgmv_expand op - sgmv_expand( - inputs_tensor[0], - lora_weights_lst[0], - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - add_inputs=True, - ) - else: - for index in range(nslices): - lora_weights = lora_weights_lst[index] - sgmv_expand_slice( - inputs_tensor[index], - lora_weights, - ref_out_tensor, - b_seq_start_loc, - seq_len_tensor, - lora_indices_tensor, - batches, - max_seq_length, - token_nums, - slice_offset, - hidden_size, - add_inputs=True, - ) - slice_offset += hidden_size - - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("scaling", SCALES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - scaling: float, - dtype: torch.dtype, - op_type: str, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights, - our_out_tensor, - ref_out_tensor, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - op_type, - device, - ) - if op_type == "shrink": - torch.ops.vllm.bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) - - bgmv_shrink( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - scaling, - ) - - else: - torch.ops.vllm.bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) - bgmv_expand( - inputs_tensor, - lora_weights, - ref_out_tensor, - indices, - add_inputs=True, - ) - - if op_type == "shrink": - ref_out_tensor = ref_out_tensor.to(torch.float32) - assert_close(our_out_tensor, ref_out_tensor) - - -@pytest.mark.parametrize("batches", BATCHES) -@pytest.mark.parametrize("num_loras", NUM_LORA) -@pytest.mark.parametrize("rank", MAX_RANKS) -@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) -@pytest.mark.parametrize("nslices", [2, 3]) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEED) -@pytest.mark.parametrize("device", DEVICES) -def test_punica_bgmv_expand_nslices( - batches: int, - num_loras: int, - rank: int, - hidden_size: int, - nslices: 
int, - dtype: torch.dtype, - seed: int, - device: str, -): - torch.set_default_device(device) - current_platform.seed_everything(seed) - - seq_length = 1 - ( - inputs_tensor, - lora_weights_lst, - our_outputs, - ref_outputs, - b_seq_start_loc, - lora_indices_tensor, - seq_len_tensor, - indices, - ) = generate_data_for_expand_nslices( - batches, - hidden_size, - num_loras, - rank, - seq_length, - dtype, - nslices, - device, - ) - slice_offset = 0 - for index in range(nslices): - lora_weights = lora_weights_lst[index] - torch.ops.vllm.bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - bgmv_expand_slice( - inputs_tensor, - lora_weights, - ref_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) - - slice_offset += hidden_size - assert_close(our_outputs, ref_outputs) diff --git a/tests/lora/utils.py b/tests/lora/utils.py index bda00e08190..1e163fbf97c 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union import torch @@ -106,6 +107,31 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) +@dataclass +class PunicaTensors: + inputs_tensor: torch.Tensor + lora_weights: Union[torch.Tensor, List[torch.Tensor]] + our_out_tensor: torch.Tensor + ref_out_tensor: torch.Tensor + b_seq_start_loc: torch.Tensor + prompt_lora_mapping: torch.Tensor + seq_len_tensor: torch.Tensor + token_lora_mapping: torch.Tensor + + def meta(self) -> Tuple[int, int]: + """ + Infer max_seq_length and token_nums from the tensors + and return them. + """ + max_seq_length = self.seq_len_tensor.max() + token_nums = self.seq_len_tensor.sum().item() + if isinstance(max_seq_length, tuple): + max_seq_length = max_seq_length[0].item() + else: + max_seq_length = max_seq_length.item() + return max_seq_length, token_nums + + def generate_data( batches, hidden_size, @@ -115,7 +141,7 @@ def generate_data( dtype, op_type, device, -): +) -> PunicaTensors: seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -164,7 +190,8 @@ def generate_data( indices[current_offset:current_offset + seq_len_tensor[b_id]].copy_(lora_index) current_offset += seq_len_tensor[b_id].item() - return ( + + return PunicaTensors( inputs_tensor, lora_weights, our_out_tensor, @@ -185,7 +212,7 @@ def generate_data_for_expand_nslices( dtype, nslices, device, -): +) -> PunicaTensors: seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -222,7 +249,7 @@ def generate_data_for_expand_nslices( current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) - return ( + return PunicaTensors( inputs_tensor, lora_weights_lst, our_out_tensor, @@ -244,7 +271,7 @@ def generate_data_for_nslices( dtype, op_type, device, -): +) -> PunicaTensors: seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches, )).to(device) b_seq_start_loc = torch.cumsum( @@ -302,7 +329,7 @@ def generate_data_for_nslices( current_offset += seq_len_tensor[b_id].item() lora_indices_tensor = lora_indices_tensor.to(device) - return ( + return PunicaTensors( inputs_tensor, lora_weights_lst, our_out_tensor, From 5b567e6364fbc38d0de687ff510797e20fca9f4f Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Tue, 11 Feb 
2025 15:49:03 +0800
Subject: [PATCH 0098/1240] [Bugfix]: Reasoning output bug according to the chat template change (#13025)

Signed-off-by: Ce Gao
Signed-off-by: Louis Ulmer
---
 .../openai_chat_completion_with_reasoning.py |   8 +-
 .../test_deepseekr1_reasoning_parser.py      | 108 +++++++++++++++---
 .../deepseek_r1_reasoning_parser.py          |  58 ++++++----
 3 files changed, 129 insertions(+), 45 deletions(-)

diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index a88c8adb55c..b5dbed1205d 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -36,8 +36,8 @@ reasoning_content = response.choices[0].message.reasoning_content
 content = response.choices[0].message.content

-print("reasoning_content:", reasoning_content)
-print("content:", content)
+print("reasoning_content for Round 1:", reasoning_content)
+print("content for Round 1:", content)

 # Round 2
 messages.append({"role": "assistant", "content": content})
@@ -50,5 +50,5 @@ reasoning_content = response.choices[0].message.reasoning_content
 content = response.choices[0].message.content

-print("reasoning_content:", reasoning_content)
-print("content:", content)
+print("reasoning_content for Round 2:", reasoning_content)
+print("content for Round 2:", content)
diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
index f7b81be48bd..fdadb2e21ff 100644
--- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -15,32 +15,62 @@ end_token = "</think>"

 SIMPLE_REASONING = {
-    "output": "<think>This is a reasoning section</think>This is the rest",
+    "output": "This is a reasoning section</think>This is the rest",
     "reasoning_content": "This is a reasoning section",
     "content": "This is the rest",
 }
 COMPLETE_REASONING = {
-    "output": "<think>This is a reasoning section</think>",
+    "output": "This is a reasoning section</think>",
     "reasoning_content": "This is a reasoning section",
     "content": None,
 }
 NO_REASONING = {
-    "output": "This is a reasoning section",
+    "output": "This is content",
     "reasoning_content": None,
-    "content": "This is a reasoning section",
+    "content": "This is content",
+}
+NO_REASONING_STREAMING = {
+    "output": "This is a reasoning section",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
 }
 MULTIPLE_LINES = {
-    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "output": "This\nThat</think>This is the rest\nThat",
     "reasoning_content": "This\nThat",
     "content": "This is the rest\nThat",
 }
 SHORTEST_REASONING_NO_STREAMING = {
-    "output": "<think></think>This is the rest",
+    "output": "</think>This is the rest",
     "reasoning_content": "",
     "content": "This is the rest",
 }
 SHORTEST_REASONING = {
-    "output": "<think></think>This is the rest",
+    "output": "</think>This is the rest",
     "reasoning_content": None,
     "content": "This is the rest",
 }
+REASONING_WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING_WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+MULTIPLE_LINES_WITH_THINK = {
+    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": "",
+    "content": "This is the rest",
+}
+SHORTEST_REASONING_WITH_THINK = {
+    "output": "<think></think>This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
@@ -49,37 +79,37 @@ pytest.param(
         False,
         SIMPLE_REASONING,
-        id="simple_streaming",
+        id="simple_reasoning",
     ),
     pytest.param(
         True,
         SIMPLE_REASONING,
-        id="simple_streaming",
+        id="simple_reasoning_streaming",
     ),
     pytest.param(
         False,
         COMPLETE_REASONING,
-        id="complete_streaming",
+        id="complete_reasoning",
     ),
     pytest.param(
         True,
         COMPLETE_REASONING,
-        id="complete_streaming",
+        id="complete_reasoning_streaming",
     ),
     pytest.param(
         False,
         NO_REASONING,
-        id="no_streaming",
+        id="no_reasoning_token",
     ),
     pytest.param(
         True,
-        NO_REASONING,
-        id="no_streaming",
+        NO_REASONING_STREAMING,
+        id="no_reasoning_token_streaming",
     ),
     pytest.param(
         False,
         MULTIPLE_LINES,
-        id="multiple_lines_streaming",
+        id="multiple_lines",
     ),
     pytest.param(
         True,
@@ -89,23 +119,65 @@ pytest.param(
         True,
         SHORTEST_REASONING,
-        id="shortest_streaming",
+        id="shortest",
     ),
     pytest.param(
         False,
         SHORTEST_REASONING_NO_STREAMING,
         id="shortest_streaming",
     ),
+    pytest.param(
+        False,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think",
+    ),
+    pytest.param(
+        True,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
+        id="shortest_with_think",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING_WITH_THINK,
+        id="shortest_with_think_streaming",
+    ),
 ]

+# Global tokenizer initialization to avoid repeated loading
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+tokenizer.add_tokens([start_token, end_token])
+

 @pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
 def test_reasoning(
     streaming: bool,
     param_dict: dict,
 ):
-    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
-    tokenizer.add_tokens([start_token, end_token])
     output = tokenizer.tokenize(param_dict["output"])
     # decode everything to tokens
     output_tokens: List[str] = [
diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
index 5c19888d454..33bba04882b 100644
--- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
+++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py
@@ -67,6 +67,8 @@ def extract_reasoning_content_streaming(
         ]):
             return None

+        # Check if <think> is present in previous or delta.
+        # Keep compatibility with models that don't generate <think> tokens.
         if self.think_start_token_id in previous_token_ids:
             if self.think_end_token_id in delta_token_ids:
                 # <think> in previous, </think> in delta,
@@ -85,7 +87,6 @@ def extract_reasoning_content_streaming(
                 # reasoning content continues
                 return DeltaMessage(reasoning_content=delta_text)
         elif self.think_start_token_id in delta_token_ids:
-            logger.info(delta_text)
             if self.think_end_token_id in delta_token_ids:
                 # <think> in delta, </think> in delta, extract reasoning content
                 start_index = delta_text.find(self.think_start_token)
@@ -101,35 +102,46 @@ def extract_reasoning_content_streaming(
                 # reasoning content continues
                 return DeltaMessage(reasoning_content=delta_text)
         else:
-            # No <think> in previous or delta, reasoning content continues.
-            return DeltaMessage(content=delta_text)
+            # No <think> in previous or delta, also need to check for </think>.
+            # Because the model may have generated </think> without <think>
+            # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+            if self.think_end_token_id in delta_token_ids:
+                # </think> in delta with more tokens,
+                # extract reasoning content and content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            elif self.think_end_token_id in previous_token_ids:
+                # </think> in previous, thinking content ends
+                return DeltaMessage(content=delta_text)
+            else:
+                # no </think> in previous or delta, reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)

     def extract_reasoning_content(
             self, model_output: str, request: ChatCompletionRequest
     ) -> Tuple[Optional[str], Optional[str]]:
-        # Check if the model output contains the <think> tokens.
-        if (self.think_start_token not in model_output
-                or self.think_end_token not in model_output):
+        # DeepSeek R1 doesn't generate <think> now.
+        # Thus we assume the reasoning content is always at the start.
+        # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f
+        if self.think_end_token not in model_output:
             return None, model_output
         else:
+            # Add a start token if it's missing to keep compatibility.
+            if self.think_start_token not in model_output:
+                model_output = f"{self.think_start_token}{model_output}"
             # Use a regex to find the reasoning content
             reasoning_content = self.reasoning_regex.findall(model_output)[0]

-            # Remove the reasoning content from the model output
-            # Although deepseek's <think> token is always at the
-            # beginning of the line, we cannot guarantee that the
-            # other models will follow this convention.
-            # Therefore, we need to add :start_index.
- start_index = model_output.find(self.think_start_token) - if start_index != -1: - end_index = start_index + len( - f"{self.think_start_token}{reasoning_content}{self.think_end_token}" - ) - model_output = model_output[:start_index] + \ - model_output[end_index:] - - if len(model_output) == 0: - return reasoning_content, None - - return reasoning_content, model_output + end_index = len( + f"{self.think_start_token}{reasoning_content}{self.think_end_token}" + ) + final_output = model_output[end_index:] + + if len(final_output) == 0: + return reasoning_content, None + + return reasoning_content, final_output From 64cfd9797608765002114894c0829dc2014b0cd3 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Tue, 11 Feb 2025 00:27:25 -0800 Subject: [PATCH 0099/1240] [V1][Metrics] Add GPU prefix cache hit rate % gauge (#12592) Signed-off-by: Louis Ulmer --- tests/entrypoints/openai/test_metrics.py | 2 + tests/v1/core/test_kv_cache_utils.py | 39 ++++++++++++++- vllm/v1/core/kv_cache_manager.py | 24 +++++++++ vllm/v1/core/kv_cache_utils.py | 64 ++++++++++++++++++++++++ vllm/v1/core/scheduler.py | 1 + vllm/v1/metrics/loggers.py | 29 ++++++++++- vllm/v1/metrics/stats.py | 20 +++++++- 7 files changed, 174 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index de2333901cc..8c1bb1a897e 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -203,6 +203,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:num_requests_running", "vllm:num_requests_waiting", "vllm:gpu_cache_usage_perc", + "vllm:gpu_prefix_cache_queries", + "vllm:gpu_prefix_cache_hits", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", "vllm:request_success_total", diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 8df4cbe1be7..ba08b83ec54 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -5,10 +5,11 @@ from vllm.multimodal.inputs import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, - KVCacheBlock, + KVCacheBlock, PrefixCachingMetrics, generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) +from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -277,3 +278,39 @@ def test_hash_request_tokens_no_mm_inputs(): assert block_hashes[0].extra_keys is None assert block_hashes[1].token_ids == (3, 4, 5) assert block_hashes[1].extra_keys is None + + +def test_metrics(): + """ + Test the prefix caching metrics. 
+ """ + + def stats(requests, queries, hits): + return PrefixCacheStats(requests=requests, queries=queries, hits=hits) + + metrics = PrefixCachingMetrics(interval=5) + assert metrics.hit_rate == 0.0 + + metrics.observe(stats(1, 20, 9)) + # 9 / 20 = 0.45 + assert metrics.hit_rate == 0.45 + + metrics.observe(stats(4, 80, 16)) + + # 25 / 100 = 0.25 + assert metrics.hit_rate == 0.25 + + metrics.observe(stats(1, 10, 2)) + + # Remove (20, 9) and add (10, 2): 18 / 90 = 0.2 + assert metrics.aggregated_requests == 5 + assert metrics.aggregated_query_total == 90 + assert metrics.aggregated_query_hit == 18 + assert metrics.hit_rate == 0.2 + + metrics.reset() + assert metrics.hit_rate == 0.0 + assert metrics.aggregated_requests == 0 + assert metrics.aggregated_query_total == 0 + assert metrics.aggregated_query_hit == 0 + assert not metrics.query_queue diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index f8d08d0e402..f75d31f542c 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -10,6 +10,7 @@ generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) +from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request, RequestStatus logger = init_logger(__name__) @@ -78,11 +79,28 @@ def __init__( self.req_to_block_hashes: DefaultDict[ str, List[BlockHashType]] = defaultdict(list) + self.prefix_cache_stats = PrefixCacheStats() + @property def usage(self) -> float: + """Get the KV cache usage. + + Returns: + The KV cache usage (between 0.0 and 1.0). + """ return 1.0 - (self.free_block_queue.num_free_blocks / self.num_gpu_blocks) + def make_prefix_cache_stats(self) -> PrefixCacheStats: + """Get (and reset) the prefix cache stats. + + Returns: + The current prefix caching stats. + """ + stats = self.prefix_cache_stats + self.prefix_cache_stats = PrefixCacheStats() + return stats + def get_computed_blocks( self, request: Request) -> Tuple[List[KVCacheBlock], int]: """Get the computed (cached) blocks for the request. @@ -118,6 +136,10 @@ def get_computed_blocks( else: break + self.prefix_cache_stats.requests += 1 + self.prefix_cache_stats.queries += len(block_hashes) + self.prefix_cache_stats.hits += len(computed_blocks) + # NOTE(woosuk): Since incomplete blocks are not eligible for # sharing, `num_computed_tokens` is always a multiple of # `block_size`. @@ -280,6 +302,8 @@ def reset_prefix_cache(self) -> bool: for block in self.block_pool: block.reset_hash() + self.prefix_cache_stats.reset = True + logger.info("Successfully reset prefix cache") return True diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6888f1a3e18..bddb482d291 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """KV-Cache Utilities.""" +from collections import deque from collections.abc import Sequence from dataclasses import dataclass from typing import Any, List, NamedTuple, Optional, Tuple @@ -8,6 +9,7 @@ from vllm.logger import init_logger from vllm.v1.kv_cache_interface import (KVCacheConfig, KVCacheSpec, KVCacheTensor) +from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request logger = init_logger(__name__) @@ -28,6 +30,68 @@ class BlockHashType(NamedTuple): extra_keys: Optional[Any] = None +class PrefixCachingMetrics: + """Metrics for prefix caching with a hit rate of the most recent N requests. + + Args: + interval: The number of the most recent requests to aggregate. 
+ Defaults to 1000. + """ + + def __init__(self, interval: int = 1000): + self.interval = interval + # The current aggregated values. + self.aggregated_requests = 0 + self.aggregated_query_total = 0 + self.aggregated_query_hit = 0 + # A deque of (requests, queries, hits) for the most recent requests. + self.query_queue: deque[Tuple[int, int, int]] = deque() + + def observe(self, stats: PrefixCacheStats): + """Observe the prefix caching for a set of requests. + + This function is called with information gathered when new requests + are being scheduled and are looking for computed blocks. + + When there are more than `interval` requests, the oldest set of + requestsare removed from the metrics. + + Args: + stats: The prefix cache stats. + """ + # reset_prefix_cache was invoked before the current update. + # Reset the metrics before aggregating the current stats. + if stats.reset: + self.reset() + + # Update the metrics. + self.query_queue.append((stats.requests, stats.queries, stats.hits)) + self.aggregated_requests += stats.requests + self.aggregated_query_total += stats.queries + self.aggregated_query_hit += stats.hits + + # Remove the oldest stats if the number of requests exceeds. + if self.aggregated_requests > self.interval: + old_requests, old_queries, old_hits = self.query_queue.popleft() + self.aggregated_requests -= old_requests + self.aggregated_query_total -= old_queries + self.aggregated_query_hit -= old_hits + + def reset(self): + """Reset the metrics.""" + self.aggregated_requests = 0 + self.aggregated_query_total = 0 + self.aggregated_query_hit = 0 + self.query_queue.clear() + + @property + def hit_rate(self) -> float: + """Calculate the hit rate for the past N requests.""" + if self.aggregated_query_total == 0: + return 0.0 + return self.aggregated_query_hit / self.aggregated_query_total + + @dataclass class KVCacheBlock: """KV-cache block metadata.""" diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 1c54914d182..985fcf01bb2 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -593,4 +593,5 @@ def make_stats(self) -> SchedulerStats: num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), gpu_cache_usage=self.kv_cache_manager.usage, + prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(), ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index eb1acf584c6..3472761dc18 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -9,6 +9,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger +from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats @@ -37,6 +38,9 @@ def _reset(self, now): self.num_prompt_tokens: List[int] = [] self.num_generation_tokens: List[int] = [] + # Prefix cache metrics. TODO: Make the interval configurable. + self.prefix_caching_metrics = PrefixCachingMetrics() + def _local_interval_elapsed(self, now: float) -> bool: # Log every _LOCAL_LOGGING_INTERVAL_SEC. 
elapsed_time = now - self.last_log_time @@ -58,6 +62,8 @@ def log(self, scheduler_stats: SchedulerStats, self._track_iteration_stats(iteration_stats) + self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats) + now = time.monotonic() if not self._local_interval_elapsed(now): return @@ -72,13 +78,15 @@ def log(self, scheduler_stats: SchedulerStats, logger.info( "Avg prompt throughput: %.1f tokens/s, " "Avg generation throughput: %.1f tokens/s, " - "Running: %d reqs, Waiting: %d reqs " - "GPU KV cache usage: %.1f%%.", + "Running: %d reqs, Waiting: %d reqs, " + "GPU KV cache usage: %.1f%%, " + "Prefix cache hit rate: %.1f%%", prompt_throughput, generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, scheduler_stats.gpu_cache_usage * 100, + self.prefix_caching_metrics.hit_rate * 100, ) @@ -107,6 +115,18 @@ def __init__(self, model_config: ModelConfig): documentation="GPU KV-cache usage. 1 means 100 percent usage.", labelnames=labelnames).labels(*labelvalues) + self.counter_gpu_prefix_cache_queries = prometheus_client.Counter( + name="vllm:gpu_prefix_cache_queries", + documentation= + "GPU prefix cache queries, in terms of number of queried blocks.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_gpu_prefix_cache_hits = prometheus_client.Counter( + name="vllm:gpu_prefix_cache_hits", + documentation= + "GPU prefix cache hits, in terms of number of cached blocks.", + labelnames=labelnames).labels(*labelvalues) + self.counter_prompt_tokens = prometheus_client.Counter( name="vllm:prompt_tokens_total", documentation="Number of prefill tokens processed.", @@ -170,6 +190,11 @@ def log(self, scheduler_stats: SchedulerStats, self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.counter_gpu_prefix_cache_queries.inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_gpu_prefix_cache_hits.inc( + scheduler_stats.prefix_cache_stats.hits) + self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) self.counter_generation_tokens.inc( iteration_stats.num_generation_tokens) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 5e588d35ea4..f806b0adf5d 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import time -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import TYPE_CHECKING, List if TYPE_CHECKING: @@ -9,6 +9,20 @@ from vllm.v1.engine import EngineCoreOutput, FinishReason +@dataclass +class PrefixCacheStats: + """Stores prefix cache hit statistics.""" + # Whether reset_prefix_cache was invoked. + reset: bool = False + # The number of requests in this update. + requests: int = 0 + # The number of queries in these requests. Note that "queries" here + # means the number of blocks that were queried from the cache. + queries: int = 0 + # The number of hits in these requests. 
+ hits: int = 0 + + @dataclass class SchedulerStats: """Stats associated with the scheduler.""" @@ -17,7 +31,9 @@ class SchedulerStats: num_waiting_reqs: int = 0 gpu_cache_usage: float = 0.0 - # gpu_prefix_cache_hit_rate: float = 0.0 + + prefix_cache_stats: PrefixCacheStats = field( + default_factory=PrefixCacheStats) @dataclass From b150df11e94f91b852858c6045d49075fef11b84 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Tue, 11 Feb 2025 21:20:53 +0800 Subject: [PATCH 0100/1240] [executor] init `local_rank` as device index (#13027) Signed-off-by: Mengqing Cao Signed-off-by: Louis Ulmer --- vllm/executor/uniproc_executor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index e5464cafaec..94db232240d 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -28,6 +28,11 @@ def _init_executor(self) -> None: distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) local_rank = 0 + # set local rank as the device index if specified + device_info = self.vllm_config.device_config.device.__str__().split( + ":") + if len(device_info) > 1: + local_rank = int(device_info[1]) rank = 0 kwargs = dict( vllm_config=self.vllm_config, From f1d60510f200c0f4677d50f768183387071ebae4 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 11 Feb 2025 08:47:10 -0500 Subject: [PATCH 0101/1240] [ROCm] Using a more precise memory profiling (#12624) Signed-off-by: Gregory Shtrasberg Signed-off-by: Louis Ulmer --- vllm/platforms/rocm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 1f690b7111e..13aebc605af 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -169,4 +169,5 @@ def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None ) -> float: torch.cuda.reset_peak_memory_stats(device) - return torch.cuda.max_memory_allocated(device) + return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info( + device)[0] From 4e7f7885cff59528fb780697d616a535af4d9f95 Mon Sep 17 00:00:00 2001 From: Yuhong Guo Date: Tue, 11 Feb 2025 21:55:57 +0800 Subject: [PATCH 0102/1240] [Build] Fix cuda link target of cumem_allocator in CPU env (#12863) Signed-off-by: YuhongGuo Co-authored-by: Tyler Michael Smith Signed-off-by: Louis Ulmer --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b99061dfde4..a0fd346c6c1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,7 +192,7 @@ set_gencode_flags_for_srcs( if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Enabling cumem allocator extension.") # link against cuda driver library - list(APPEND CUMEM_LIBS cuda) + list(APPEND CUMEM_LIBS CUDA::cuda_driver) define_gpu_extension_target( cumem_allocator DESTINATION vllm From 24fb1890872ecfe0fdf44b61e80afe981d73a769 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Tue, 11 Feb 2025 22:06:46 +0800 Subject: [PATCH 0103/1240] [Platform] add pre_register_and_update function (#12432) Signed-off-by: wangxiyuan Signed-off-by: Louis Ulmer --- vllm/config.py | 3 ++- vllm/engine/arg_utils.py | 21 +++++++++++++++++++++ vllm/platforms/interface.py | 18 ++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 426ba380802..1d8c42dd276 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3057,7 +3057,8 @@ class VllmConfig: 
kv_transfer_config: KVTransferConfig = field(default=None, init=True) # type: ignore # some opaque config, only used to provide additional information - # for the hash computation, mainly used for testing and debugging. + # for the hash computation, mainly used for testing, debugging or out of + # tree config registration. additional_config: SupportsHash = field(default=None, init=True) # type: ignore instance_id: str = "" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 40c6fb45679..4232ad9204f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -20,6 +20,7 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.plugins import load_general_plugins from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -203,6 +204,8 @@ class EngineArgs: calculate_kv_scales: Optional[bool] = None + additional_config: Optional[Dict[str, Any]] = None + def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model @@ -984,6 +987,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'be loaded from the model checkpoint if available. ' 'Otherwise, the scales will default to 1.0.') + parser.add_argument( + "--additional-config", + type=json.loads, + default=None, + help="Additional config for specified platform in JSON format. " + "Different platforms may support different configs. Make sure the " + "configs are valid for the platform you are using. The input format" + " is like '{\"config_key\":\"config_value\"}'") return parser @classmethod @@ -1044,6 +1055,9 @@ def create_load_config(self) -> LoadConfig: def create_engine_config(self, usage_context: Optional[UsageContext] = None ) -> VllmConfig: + from vllm.platforms import current_platform + current_platform.pre_register_and_update() + if envs.VLLM_USE_V1: self._override_v1_engine_args(usage_context) @@ -1287,6 +1301,7 @@ def create_engine_config(self, prompt_adapter_config=prompt_adapter_config, compilation_config=self.compilation_config, kv_transfer_config=self.kv_transfer_config, + additional_config=self.additional_config, ) if envs.VLLM_USE_V1: @@ -1347,6 +1362,12 @@ def add_cli_args(parser: FlexibleArgumentParser, parser.add_argument('--disable-log-requests', action='store_true', help='Disable logging requests.') + # Initialize plugin to update the parser, for example, The plugin may + # adding a new kind of quantization method to --quantization argument or + # a new device to --device argument. + load_general_plugins() + from vllm.platforms import current_platform + current_platform.pre_register_and_update(parser) return parser diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 645d98a1bb4..61673b08543 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -13,8 +13,10 @@ if TYPE_CHECKING: from vllm.config import VllmConfig + from vllm.utils import FlexibleArgumentParser else: VllmConfig = None + FlexibleArgumentParser = None logger = init_logger(__name__) @@ -223,6 +225,22 @@ def seed_everything(cls, seed: Optional[int] = None) -> None: np.random.seed(seed) torch.manual_seed(seed) + @classmethod + def pre_register_and_update(cls, + parser: Optional[FlexibleArgumentParser] = None + ) -> None: + """ + Do some pre-registeration or update action for the current platform. 
+ + This function is called before global VllmConfig is initialized or cli + arguments are parsed. It's used for out-of-tree platforms to register or + update the configuration. + + For example, the out-of-tree quantization config can be imported and + registered here dynamically. + """ + pass + @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: """ From 46634567ce97675dfbbb66fbc6dbfd158c57f0ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E0=AE=AE=E0=AE=A9=E0=AF=8B=E0=AE=9C=E0=AF=8D=E0=AE=95?= =?UTF-8?q?=E0=AF=81=E0=AE=AE=E0=AE=BE=E0=AE=B0=E0=AF=8D=20=E0=AE=AA?= =?UTF-8?q?=E0=AE=B4=E0=AE=A9=E0=AE=BF=E0=AE=9A=E0=AF=8D=E0=AE=9A=E0=AE=BE?= =?UTF-8?q?=E0=AE=AE=E0=AE=BF?= Date: Tue, 11 Feb 2025 20:11:20 +0530 Subject: [PATCH 0104/1240] [Bugfix] fix flaky test (#13089) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: மனோஜ்குமார் பழனிச்சாமி Signed-off-by: Louis Ulmer --- tests/test_seed_behavior.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py index 7e4e71563e7..c45ed6926d7 100644 --- a/tests/test_seed_behavior.py +++ b/tests/test_seed_behavior.py @@ -8,32 +8,17 @@ def test_seed_behavior(): - # Test with seed=None - Platform.seed_everything(None) + # Test with a specific seed + Platform.seed_everything(42) random_value_1 = random.randint(0, 100) np_random_value_1 = np.random.randint(0, 100) torch_random_value_1 = torch.randint(0, 100, (1, )).item() - Platform.seed_everything(None) + Platform.seed_everything(42) random_value_2 = random.randint(0, 100) np_random_value_2 = np.random.randint(0, 100) torch_random_value_2 = torch.randint(0, 100, (1, )).item() - assert random_value_1 != random_value_2 - assert np_random_value_1 != np_random_value_2 - assert torch_random_value_1 != torch_random_value_2 - - # Test with a specific seed - Platform.seed_everything(42) - random_value_3 = random.randint(0, 100) - np_random_value_3 = np.random.randint(0, 100) - torch_random_value_3 = torch.randint(0, 100, (1, )).item() - - Platform.seed_everything(42) - random_value_4 = random.randint(0, 100) - np_random_value_4 = np.random.randint(0, 100) - torch_random_value_4 = torch.randint(0, 100, (1, )).item() - - assert random_value_3 == random_value_4 - assert np_random_value_3 == np_random_value_4 - assert torch_random_value_3 == torch_random_value_4 + assert random_value_1 == random_value_2 + assert np_random_value_1 == np_random_value_2 + assert torch_random_value_1 == torch_random_value_2 From 4e60861c9a04e460c63d13b3c891c8d114bb70fb Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Tue, 11 Feb 2025 15:14:00 +0000 Subject: [PATCH 0105/1240] [V1][Metrics] Add several request timing histograms (#12644) Signed-off-by: Mark McLoughlin Signed-off-by: Louis Ulmer --- tests/entrypoints/openai/test_metrics.py | 31 +++++++ tests/v1/core/test_scheduler.py | 3 +- tests/v1/engine/test_engine_core.py | 6 +- tests/v1/engine/test_engine_core_client.py | 2 + tests/v1/engine/test_output_processor.py | 23 +++-- vllm/v1/core/kv_cache_manager.py | 3 + vllm/v1/core/scheduler.py | 33 +++++++- vllm/v1/engine/__init__.py | 33 +++++++- vllm/v1/engine/async_llm.py | 24 +++--- vllm/v1/engine/core.py | 10 ++- vllm/v1/engine/core_client.py | 19 +++-- vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/output_processor.py | 59 +++++++++---- vllm/v1/metrics/loggers.py | 49 +++++++++++ vllm/v1/metrics/stats.py | 97 +++++++++++++++++----- vllm/v1/request.py | 25 
++++-- 16 files changed, 334 insertions(+), 84 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 8c1bb1a897e..34b648b6e99 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -85,6 +85,10 @@ async def client(server): "vllm:time_per_output_token_seconds": [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))], "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)], + "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)], + "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)], + "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)], + "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)], "vllm:request_prompt_tokens": [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST), ("_count", _NUM_REQUESTS)], @@ -169,6 +173,18 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:e2e_request_latency_seconds_sum", "vllm:e2e_request_latency_seconds_bucket", "vllm:e2e_request_latency_seconds_count", + "vllm:request_queue_time_seconds_sum", + "vllm:request_queue_time_seconds_bucket", + "vllm:request_queue_time_seconds_count", + "vllm:request_inference_time_seconds_sum", + "vllm:request_inference_time_seconds_bucket", + "vllm:request_inference_time_seconds_count", + "vllm:request_prefill_time_seconds_sum", + "vllm:request_prefill_time_seconds_bucket", + "vllm:request_prefill_time_seconds_count", + "vllm:request_decode_time_seconds_sum", + "vllm:request_decode_time_seconds_bucket", + "vllm:request_decode_time_seconds_count", "vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_count", @@ -220,6 +236,21 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:time_per_output_token_seconds_sum", "vllm:time_per_output_token_seconds_bucket", "vllm:time_per_output_token_seconds_count", + "vllm:e2e_request_latency_seconds_sum", + "vllm:e2e_request_latency_seconds_bucket", + "vllm:e2e_request_latency_seconds_count", + "vllm:request_queue_time_seconds_sum", + "vllm:request_queue_time_seconds_bucket", + "vllm:request_queue_time_seconds_count", + "vllm:request_inference_time_seconds_sum", + "vllm:request_inference_time_seconds_bucket", + "vllm:request_inference_time_seconds_count", + "vllm:request_prefill_time_seconds_sum", + "vllm:request_prefill_time_seconds_bucket", + "vllm:request_prefill_time_seconds_count", + "vllm:request_decode_time_seconds_sum", + "vllm:request_decode_time_seconds_bucket", + "vllm:request_decode_time_seconds_count", ] diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 0d29729a454..8aba46aec47 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -38,7 +38,8 @@ def create_scheduler( return Scheduler(scheduler_config, model_config, cache_config, - lora_config=None) + lora_config=None, + log_stats=True) def create_requests( diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 6a91f190118..36b31550dc0 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -50,7 +50,8 @@ def test_engine_core(monkeypatch): executor_class = Executor.get_class(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class) + executor_class=executor_class, + log_stats=True) """Test basic request lifecycle.""" # First request. 
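# [Editor's aside, not part of the patch] The new histograms exercised by these
# tests are derived from per-request timestamps: QUEUED/SCHEDULED engine-core
# events plus the first- and last-token times. A rough, self-contained sketch of
# how the intervals computed in vllm/v1/metrics/stats.py relate to each metric
# (names below are illustrative only):
from dataclasses import dataclass

@dataclass
class _ReqTimestamps:
    arrival: float      # wall clock, recorded by the engine frontend
    queued: float       # monotonic, EngineCoreEventType.QUEUED
    scheduled: float    # monotonic, EngineCoreEventType.SCHEDULED
    first_token: float  # monotonic, when the first output token is produced
    last_token: float   # monotonic, when the final output token is produced

def _request_intervals(ts: _ReqTimestamps, finished_wall: float) -> dict:
    return {
        "queue_time": ts.scheduled - ts.queued,          # vllm:request_queue_time_seconds
        "prefill_time": ts.first_token - ts.scheduled,   # vllm:request_prefill_time_seconds
        "decode_time": ts.last_token - ts.first_token,   # vllm:request_decode_time_seconds
        "inference_time": ts.last_token - ts.scheduled,  # vllm:request_inference_time_seconds
        "e2e_latency": finished_wall - ts.arrival,       # vllm:e2e_request_latency_seconds
    }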
@@ -157,7 +158,8 @@ def test_engine_core_advanced_sampling(monkeypatch): executor_class = Executor.get_class(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class) + executor_class=executor_class, + log_stats=True) """Test basic request lifecycle.""" # First request. request: EngineCoreRequest = make_request() diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index b2539132f4e..45080be8e8c 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -94,6 +94,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, + log_stats=False, ) MAX_TOKENS = 20 @@ -163,6 +164,7 @@ async def test_engine_core_client_asyncio(monkeypatch): asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, + log_stats=True, ) MAX_TOKENS = 20 diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index c8f43edb70b..1d47df417dd 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math +import time from typing import Dict, List, Optional import pytest @@ -15,6 +16,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.output_processor import OutputProcessor +from vllm.v1.metrics.stats import IterationStats def _ref_convert_id_to_token( @@ -603,6 +605,7 @@ def test_iteration_stats(dummy_test_vectors): output_processor = OutputProcessor(dummy_test_vectors.tokenizer_group, log_stats=True) engine_core = MockEngineCore(dummy_test_vectors.generation_tokens) + engine_core_timestamp = time.monotonic() # Make N requests. requests = [ @@ -630,8 +633,9 @@ def test_iteration_stats(dummy_test_vectors): # First iteration has 2 prefills. outputs = engine_core.get_outputs()[:num_active] - processed_outputs = output_processor.process_outputs(outputs) - iteration_stats = processed_outputs.iteration_stats + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) total_prompt_tokens = sum([ len(prompt_tokens) for prompt_tokens in dummy_test_vectors.prompt_tokens[:num_active] @@ -642,8 +646,9 @@ def test_iteration_stats(dummy_test_vectors): # Just decodes in this step. outputs = engine_core.get_outputs()[:num_active] - processed_outputs = output_processor.process_outputs(outputs) - iteration_stats = processed_outputs.iteration_stats + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) assert iteration_stats.num_prompt_tokens == 0 assert iteration_stats.num_generation_tokens == num_active @@ -652,8 +657,9 @@ def test_iteration_stats(dummy_test_vectors): output_processor.add_request(inactive_request) num_active += 1 outputs = engine_core.get_outputs()[:num_active] - processed_outputs = output_processor.process_outputs(outputs) - iteration_stats = processed_outputs.iteration_stats + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) total_prompt_tokens = len(dummy_test_vectors.prompt_tokens[num_active - 1]) assert iteration_stats.num_prompt_tokens == total_prompt_tokens @@ -661,8 +667,9 @@ def test_iteration_stats(dummy_test_vectors): # Just decodes in this step. 
outputs = engine_core.get_outputs()[:num_active] - processed_outputs = output_processor.process_outputs(outputs) - iteration_stats = processed_outputs.iteration_stats + iteration_stats = IterationStats() + output_processor.process_outputs(outputs, engine_core_timestamp, + iteration_stats) assert iteration_stats.num_prompt_tokens == 0 assert iteration_stats.num_generation_tokens == num_active diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index f75d31f542c..0381e5cdd09 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -26,6 +26,7 @@ def __init__( sliding_window: Optional[int] = None, enable_caching: bool = True, num_preallocate_tokens: int = 64, + log_stats: bool = False, ) -> None: self.block_size = block_size self.num_gpu_blocks = num_gpu_blocks @@ -33,6 +34,8 @@ def __init__( self.max_num_blocks_per_req = cdiv(max_model_len, block_size) self.sliding_window = sliding_window self.enable_caching = enable_caching + # FIXME: make prefix cache stats conditional on log_stats + self.log_stats = log_stats # NOTE(woosuk): To avoid frequent block allocation, we preallocate some # blocks for each request. For example, when a request reaches the end # of its block table, we preallocate N blocks in advance. This way, we diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 985fcf01bb2..e32e557ae23 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import time from collections import deque from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union @@ -10,7 +11,8 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.scheduler_output import (CachedRequestData, NewRequestData, SchedulerOutput) -from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs +from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, + EngineCoreOutput, EngineCoreOutputs) from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -26,10 +28,12 @@ def __init__( model_config: ModelConfig, cache_config: CacheConfig, lora_config: Optional[LoRAConfig], + log_stats: bool, ) -> None: self.scheduler_config = scheduler_config self.cache_config = cache_config self.lora_config = lora_config + self.log_stats = log_stats # Scheduling constraints. self.max_num_running_reqs = self.scheduler_config.max_num_seqs @@ -45,7 +49,8 @@ def __init__( num_gpu_blocks=num_gpu_blocks, max_model_len=self.max_model_len, sliding_window=self.cache_config.sliding_window, - enable_caching=self.cache_config.enable_prefix_caching) + enable_caching=self.cache_config.enable_prefix_caching, + log_stats=self.log_stats) self.block_size = self.cache_config.block_size # req_id -> Request @@ -107,6 +112,8 @@ def schedule(self) -> "SchedulerOutput": scheduled_encoder_inputs: Dict[str, List[int]] = {} encoder_budget = self.max_num_encoder_input_tokens + scheduled_timestamp = time.monotonic() + # First, schedule the RUNNING requests. 
req_index = 0 while req_index < len(self.running) and token_budget > 0: @@ -246,6 +253,7 @@ def schedule(self) -> "SchedulerOutput": self.running.append(request) if request.status == RequestStatus.WAITING: scheduled_new_reqs.append(request) + self.request_scheduled(request, scheduled_timestamp) elif request.status == RequestStatus.PREEMPTED: scheduled_resumed_reqs.append(request) else: @@ -508,7 +516,8 @@ def update_from_output( finish_reason=request.get_finished_reason(), new_logprobs=new_logprobs, new_prompt_logprobs_tensors=prompt_logprobs_tensors, - stop_reason=request.stop_reason)) + stop_reason=request.stop_reason, + events=request.take_events())) if not stopped: new_running.append(request) @@ -541,6 +550,7 @@ def _check_stop(self, request: Request) -> bool: def add_request(self, request: Request) -> None: self.waiting.append(request) self.requests[request.request_id] = request + self.request_queued(request) def finish_requests( self, @@ -588,7 +598,22 @@ def has_unfinished_requests(self) -> bool: def reset_prefix_cache(self) -> bool: return self.kv_cache_manager.reset_prefix_cache() - def make_stats(self) -> SchedulerStats: + def request_queued(self, request: Request): + if not self.log_stats: + return + request.events.append( + EngineCoreEvent.new_event(EngineCoreEventType.QUEUED)) + + def request_scheduled(self, request: Request, timestamp: float): + if not self.log_stats: + return + request.events.append( + EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED, + timestamp)) + + def make_stats(self) -> Optional[SchedulerStats]: + if not self.log_stats: + return None return SchedulerStats( num_running_reqs=len(self.running), num_waiting_reqs=len(self.waiting), diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 30e1185019d..782fdcee380 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import enum +import time from typing import List, Optional, Union import msgspec @@ -60,6 +61,30 @@ class EngineCoreRequest( lora_request: Optional[LoRARequest] +class EngineCoreEventType(enum.IntEnum): + """The type of engine core request event.""" + QUEUED = 1 + SCHEDULED = 2 + + +class EngineCoreEvent(msgspec.Struct): + """A timestamped engine core event associated with a request. + + The timestamp is a monotonic timestamps and is used for by the engine + frontend to calculate intervals between engine core events. These + timestamps should not be compared with timestamps from other processes. 
+ """ + type: EngineCoreEventType + timestamp: float + + @classmethod + def new_event(cls, + event_type: EngineCoreEventType, + timestamp: Optional[float] = None) -> "EngineCoreEvent": + timestamp = time.monotonic() if timestamp is None else timestamp + return cls(event_type, timestamp) + + class EngineCoreOutput( msgspec.Struct, array_like=True, # type: ignore[call-arg] @@ -74,6 +99,7 @@ class EngineCoreOutput( finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None + events: Optional[List[EngineCoreEvent]] = None @property def finished(self) -> bool: @@ -91,7 +117,12 @@ class EngineCoreOutputs( # [num_reqs] outputs: List[EngineCoreOutput] - scheduler_stats: SchedulerStats + scheduler_stats: Optional[SchedulerStats] + timestamp: float = 0.0 + + def __post_init__(self): + if self.timestamp == 0.0: + self.timestamp = time.monotonic() class EngineCoreRequestType(enum.Enum): diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3c4e35e4aa2..f19d2ed8bcb 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -53,10 +53,12 @@ def __init__( self.log_requests = log_requests self.log_stats = log_stats - self.stat_loggers: List[StatLoggerBase] = [ - LoggingStatLogger(), - PrometheusStatLogger(vllm_config.model_config), - ] + self.stat_loggers: List[StatLoggerBase] = [] + if self.log_stats: + self.stat_loggers.extend([ + LoggingStatLogger(), + PrometheusStatLogger(vllm_config.model_config), + ]) # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -85,6 +87,7 @@ def __init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, + log_stats=self.log_stats, ) self.output_handler: Optional[asyncio.Task] = None @@ -246,6 +249,8 @@ async def _run_output_handler(self): # 1) Pull EngineCoreOutputs from the EngineCore. outputs = await self.engine_core.get_output_async() + iteration_stats = IterationStats() if self.log_stats else None + # Split outputs into chunks of at most # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the # event loop for too long. @@ -257,14 +262,12 @@ async def _run_output_handler(self): outputs.outputs, cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) - iteration_stats = None for i, outputs_slice in enumerate(slices): # 2) Process EngineCoreOutputs. processed_outputs = self.output_processor.process_outputs( - outputs_slice, iteration_stats) + outputs_slice, outputs.timestamp, iteration_stats) # NOTE: RequestOutputs are pushed to their queues. assert not processed_outputs.request_outputs - iteration_stats = processed_outputs.iteration_stats # Allow other asyncio tasks to run between chunks if i + 1 < len(slices): @@ -277,7 +280,6 @@ async def _run_output_handler(self): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
- assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, @@ -299,12 +301,14 @@ async def abort(self, request_id: str) -> None: def _log_stats( self, - scheduler_stats: SchedulerStats, - iteration_stats: IterationStats, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], ): if not self.log_stats: return + assert scheduler_stats is not None + assert iteration_stats is not None for logger in self.stat_loggers: logger.log(scheduler_stats=scheduler_stats, iteration_stats=iteration_stats) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index c90667ba033..e4677681bd2 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -38,12 +38,15 @@ def __init__( self, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool, ): assert vllm_config.model_config.runner_type != "pooling" logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) + self.log_stats = log_stats + # Setup Model. self.model_executor = executor_class(vllm_config) @@ -59,6 +62,7 @@ def __init__( model_config=vllm_config.model_config, cache_config=vllm_config.cache_config, lora_config=vllm_config.lora_config, + log_stats=self.log_stats, ) self.mm_input_mapper_server = MMInputMapperServer( @@ -148,11 +152,9 @@ def __init__( ready_pipe: Connection, vllm_config: VllmConfig, executor_class: Type[Executor], - log_stats: bool = False, + log_stats: bool, ): - super().__init__(vllm_config, executor_class) - - self.log_stats = log_stats + super().__init__(vllm_config, executor_class, log_stats) # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 2d7d6b42ced..b3de5cdc244 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -41,6 +41,7 @@ def make_client( asyncio_mode: bool, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool, ) -> "EngineCoreClient": # TODO: support this for debugging purposes. 
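# [Editor's aside, not part of the patch] make_client() now threads log_stats into
# every client constructor; the dispatch on (multiprocess_mode, asyncio_mode) shown
# in the hunk below boils down to the following sketch (client names match vLLM but
# are returned here only as strings for illustration):
def _client_kind(multiprocess_mode: bool, asyncio_mode: bool) -> str:
    if asyncio_mode and not multiprocess_mode:
        # Not currently supported (see the TODO above).
        raise NotImplementedError("asyncio without multiprocessing")
    if multiprocess_mode and asyncio_mode:
        return "AsyncMPClient"   # EngineCore in a background process, asyncio API
    if multiprocess_mode:
        return "SyncMPClient"    # EngineCore in a background process, blocking API
    return "InprocClient"        # EngineCore runs in the calling process
# e.g. _client_kind(True, False) == "SyncMPClient"; each constructor receives
# (vllm_config, executor_class, log_stats).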
@@ -50,12 +51,12 @@ def make_client( "is not currently supported.") if multiprocess_mode and asyncio_mode: - return AsyncMPClient(vllm_config, executor_class) + return AsyncMPClient(vllm_config, executor_class, log_stats) if multiprocess_mode and not asyncio_mode: - return SyncMPClient(vllm_config, executor_class) + return SyncMPClient(vllm_config, executor_class, log_stats) - return InprocClient(vllm_config, executor_class) + return InprocClient(vllm_config, executor_class, log_stats) @abstractmethod def shutdown(self): @@ -204,13 +205,13 @@ def shutdown(self): class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, - executor_class: Type[Executor]): + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool): super().__init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, - log_stats=False, + log_stats=log_stats, ) def get_output(self) -> EngineCoreOutputs: @@ -245,13 +246,13 @@ def reset_prefix_cache(self) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, - executor_class: Type[Executor]): + def __init__(self, vllm_config: VllmConfig, executor_class: Type[Executor], + log_stats: bool): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, - log_stats=True, + log_stats=log_stats, ) self.outputs_queue: Optional[asyncio.Queue[bytes]] = None diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 3ef5a970606..c9a4c5369df 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -73,6 +73,7 @@ def __init__( asyncio_mode=False, vllm_config=vllm_config, executor_class=executor_class, + log_stats=False, # FIXME: implement ) @classmethod diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 5dbf530caa1..7973c62c381 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -19,7 +19,6 @@ class OutputProcessorOutput: request_outputs: List[RequestOutput] reqs_to_abort: List[str] - iteration_stats: IterationStats class RequestState: @@ -34,6 +33,7 @@ def __init__( detokenizer: IncrementalDetokenizer, arrival_time: float, queue: Optional[asyncio.Queue[RequestOutput]], + log_stats: bool, ): self.request_id = request_id self.output_kind = output_kind @@ -45,14 +45,16 @@ def __init__( self.is_prefilling = True self.queue = queue - self.stats = RequestStateStats(last_token_time=arrival_time) + self.stats = RequestStateStats( + arrival_time=arrival_time) if log_stats else None @classmethod def from_new_request( cls, tokenizer: AnyTokenizer, request: EngineCoreRequest, - queue: Optional[asyncio.Queue[RequestOutput]] = None, + queue: Optional[asyncio.Queue[RequestOutput]], + log_stats: bool, ) -> "RequestState": return cls( request_id=request.request_id, @@ -69,6 +71,7 @@ def from_new_request( ), arrival_time=request.arrival_time, queue=queue, + log_stats=log_stats, ) @@ -112,11 +115,13 @@ def add_request( self.request_states[request_id] = RequestState.from_new_request( tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request), request=request, - queue=queue) + queue=queue, + log_stats=self.log_stats) def process_outputs( self, engine_core_outputs: List[EngineCoreOutput], + engine_core_timestamp: Optional[float] = None, iteration_stats: Optional[IterationStats] = None, ) -> OutputProcessorOutput: """ @@ -145,8 +150,6 
@@ def process_outputs( request_outputs: List[RequestOutput] = [] reqs_to_abort: List[str] = [] - if not iteration_stats: - iteration_stats = IterationStats(self.log_stats) for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) @@ -155,10 +158,9 @@ def process_outputs( continue # 1) Compute stats for this iteration. - iteration_stats.update_from_output(engine_core_output, - req_state.is_prefilling, - req_state.prompt_len, - req_state.stats) + self._update_stats_from_output(req_state, engine_core_output, + engine_core_timestamp, + iteration_stats) new_token_ids = engine_core_output.new_token_ids finish_reason = engine_core_output.finish_reason @@ -205,17 +207,44 @@ def process_outputs( # detected stop string, abort needed in EngineCore. reqs_to_abort.append(req_id) - # Track per-request stats. - assert finish_reason is not None - iteration_stats.update_from_finished_request( - finish_reason, request_output, req_state.stats) + # Track per-request stats + self._update_stats_from_finished(req_state, request_output, + finish_reason, + iteration_stats) return OutputProcessorOutput( request_outputs=request_outputs, reqs_to_abort=reqs_to_abort, - iteration_stats=iteration_stats, ) + def _update_stats_from_output(self, req_state: RequestState, + engine_core_output: EngineCoreOutput, + engine_core_timestamp: Optional[float], + iteration_stats: Optional[IterationStats]): + if iteration_stats is None: + return + + assert engine_core_timestamp is not None + assert req_state.stats is not None + iteration_stats.update_from_output(engine_core_output, + engine_core_timestamp, + req_state.is_prefilling, + req_state.prompt_len, + req_state.stats) + + def _update_stats_from_finished(self, req_state: RequestState, + request_output: RequestOutput, + finish_reason: Optional[FinishReason], + iteration_stats: Optional[IterationStats]): + if iteration_stats is None: + return + + assert finish_reason is not None + assert req_state.stats is not None + iteration_stats.update_from_finished_request(finish_reason, + request_output, + req_state.stats) + @staticmethod def _make_request_output( request_state: RequestState, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 3472761dc18..439be38a3e7 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -182,6 +182,45 @@ def __init__(self, model_config: ModelConfig): ], labelnames=labelnames).labels(*labelvalues) + request_latency_buckets = [ + 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, + 40.0, 50.0, 60.0 + ] + self.histogram_e2e_time_request = \ + prometheus_client.Histogram( + name="vllm:e2e_request_latency_seconds", + documentation="Histogram of e2e request latency in seconds.", + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_queue_time_request = \ + prometheus_client.Histogram( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_inference_time_request = \ + prometheus_client.Histogram( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_prefill_time_request = \ + prometheus_client.Histogram( + name="vllm:request_prefill_time_seconds", + documentation= + 
"Histogram of time spent in PREFILL phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + self.histogram_decode_time_request = \ + prometheus_client.Histogram( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + def log(self, scheduler_stats: SchedulerStats, iteration_stats: IterationStats): """Log to prometheus.""" @@ -201,6 +240,12 @@ def log(self, scheduler_stats: SchedulerStats, for finished_request in iteration_stats.finished_requests: self.counter_request_success[finished_request.finish_reason].inc() + self.histogram_e2e_time_request.observe( + finished_request.e2e_latency) + self.histogram_inference_time_request.observe( + finished_request.inference_time) + self.histogram_decode_time_request.observe( + finished_request.decode_time) self.histogram_num_prompt_tokens_request.observe( finished_request.num_prompt_tokens) self.histogram_num_generation_tokens_request.observe( @@ -210,6 +255,10 @@ def log(self, scheduler_stats: SchedulerStats, self.histogram_time_to_first_token.observe(ttft) for tpot in iteration_stats.time_per_output_tokens_iter: self.histogram_time_per_output_token.observe(tpot) + for queue_time in iteration_stats.queue_times_iter: + self.histogram_queue_time_request.observe(queue_time) + for prefill_time in iteration_stats.prefill_times_iter: + self.histogram_prefill_time_request.observe(prefill_time) @staticmethod def _unregister_vllm_metrics(): diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index f806b0adf5d..a0e6204929e 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from vllm.outputs import RequestOutput - from vllm.v1.engine import EngineCoreOutput, FinishReason + from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput, FinishReason @dataclass @@ -41,7 +41,15 @@ class RequestStateStats: """Stats that need to be tracked across delta updates.""" num_generation_tokens: int = 0 - last_token_time: float = 0.0 + + # This is a engine frontend timestamp (wall-clock) + arrival_time: float = 0.0 + + # These are engine core timestamps (monotonic) + queued_ts: float = 0.0 + scheduled_ts: float = 0.0 + first_token_ts: float = 0.0 + last_token_ts: float = 0.0 @dataclass @@ -49,33 +57,37 @@ class FinishedRequestStats: """Stats associated with a finished request.""" finish_reason: "FinishReason" + e2e_latency: float = 0.0 num_prompt_tokens: int = 0 num_generation_tokens: int = 0 + inference_time: float = 0.0 + decode_time: float = 0.0 class IterationStats: """Stats associated with a single set of EngineCoreOutputs.""" - def __init__(self, log_stats: bool): - self.log_stats = log_stats + def __init__(self): + self.iteration_timestamp = time.time() self.num_generation_tokens = 0 self.num_prompt_tokens = 0 self.finished_requests: List[FinishedRequestStats] = [] self.time_to_first_tokens_iter: List[float] = [] self.time_per_output_tokens_iter: List[float] = [] + self.queue_times_iter: List[float] = [] + self.prefill_times_iter: List[float] = [] - def update_from_output(self, output: "EngineCoreOutput", - is_prefilling: bool, prompt_len: int, - request_state_stats: RequestStateStats): - if not self.log_stats: - return + def _time_since(self, start: float) -> float: + """Calculate an interval relative to this iteration's timestamp.""" + return self.iteration_timestamp - start + def update_from_output(self, output: 
"EngineCoreOutput", + engine_core_timestamp: float, is_prefilling: bool, + prompt_len: int, req_stats: RequestStateStats): num_new_generation_tokens = len(output.new_token_ids) - now = time.time() - last_token_latency = now - request_state_stats.last_token_time self.num_generation_tokens += num_new_generation_tokens - if is_prefilling: + if is_prefilling and num_new_generation_tokens > 0: # TODO(andy): we used to assert that num_new_generation_tokens # > 0 with an invariant that EngineCore does not stream outputs # for partially completed prefills (scheduler.update_from_output @@ -84,19 +96,58 @@ def update_from_output(self, output: "EngineCoreOutput", # partially completed prompt. # This will be reverted in a follow up PR and we should re-enable # this assertion / invariant. + self.num_prompt_tokens += prompt_len + + first_token_latency = self._time_since(req_stats.arrival_time) + self.time_to_first_tokens_iter.append(first_token_latency) + + req_stats.num_generation_tokens += num_new_generation_tokens + + # Process request-level engine core events + if output.events is not None: + self.update_from_events(output.events, is_prefilling, req_stats) + + # Process the batch-level "new tokens" engine core event + if is_prefilling: + # TODO: re-enable no-output-for-partial-prefills invariant as above if num_new_generation_tokens > 0: - self.num_prompt_tokens += prompt_len - self.time_to_first_tokens_iter.append(last_token_latency) + prefill_interval = \ + engine_core_timestamp - req_stats.scheduled_ts + self.prefill_times_iter.append(prefill_interval) + req_stats.first_token_ts = engine_core_timestamp else: - self.time_per_output_tokens_iter.append(last_token_latency) - - request_state_stats.num_generation_tokens += num_new_generation_tokens - request_state_stats.last_token_time = now + tpot = engine_core_timestamp - req_stats.last_token_ts + self.time_per_output_tokens_iter.append(tpot) + + # TODO: re-enable no-output-for-partial-prefills invariant as above + if num_new_generation_tokens > 0: + req_stats.last_token_ts = engine_core_timestamp + + def update_from_events(self, events: List["EngineCoreEvent"], + is_prefilling: bool, req_stats: RequestStateStats): + # Avoid circular dependency + from vllm.v1.engine import EngineCoreEventType + for event in events: + if event.type == EngineCoreEventType.QUEUED: + req_stats.queued_ts = event.timestamp + elif event.type == EngineCoreEventType.SCHEDULED: + queued_interval = event.timestamp - req_stats.queued_ts + self.queue_times_iter.append(queued_interval) + req_stats.scheduled_ts = event.timestamp def update_from_finished_request(self, finish_reason: "FinishReason", request_output: "RequestOutput", - request_state_stats: RequestStateStats): - self.finished_requests.append( - FinishedRequestStats(finish_reason, - len(request_output.prompt_token_ids), - request_state_stats.num_generation_tokens)) + req_stats: RequestStateStats): + e2e_latency = self._time_since(req_stats.arrival_time) + + inference_time = req_stats.last_token_ts - req_stats.scheduled_ts + decode_time = req_stats.last_token_ts - req_stats.first_token_ts + + finished_req = \ + FinishedRequestStats(finish_reason=finish_reason, + e2e_latency=e2e_latency, + num_prompt_tokens=len(request_output.prompt_token_ids), + num_generation_tokens=req_stats.num_generation_tokens, + inference_time=inference_time, + decode_time=decode_time) + self.finished_requests.append(finished_req) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index bb4d2c19197..0ebaa71ce74 100644 --- a/vllm/v1/request.py +++ 
b/vllm/v1/request.py @@ -5,8 +5,8 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams -from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest, FinishReason +from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, + EngineCoreRequest, FinishReason) from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -33,14 +33,10 @@ def __init__( self.sampling_params = sampling_params # Because of LoRA, the eos token id can be different for each request. self.eos_token_id = eos_token_id - self.metrics = RequestMetrics(arrival_time=arrival_time, - last_token_time=arrival_time, - first_scheduled_time=None, - first_token_time=None, - time_in_queue=None) self.lora_request = lora_request self.status = RequestStatus.WAITING + self.events: List[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens @@ -83,6 +79,21 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": lora_request=request.lora_request, ) + def queued(self, timestamp: Optional[float] = None) -> None: + self.events.append( + EngineCoreEvent.new_event(EngineCoreEventType.QUEUED, timestamp)) + + def scheduled(self, timestamp: Optional[float] = None) -> None: + self.events.append( + EngineCoreEvent.new_event(EngineCoreEventType.SCHEDULED, + timestamp)) + + def take_events(self) -> Optional[List[EngineCoreEvent]]: + if not self.events: + return None + events, self.events = self.events, [] + return events + def append_output_token_ids( self, token_ids: Union[int, List[int]], From 7bdc45757846e9dad6a7c68d4eca24f5414720b5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:51:19 +0000 Subject: [PATCH 0106/1240] Set `torch_dtype` in `TransformersModel` (#13088) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Louis Ulmer --- vllm/model_executor/models/transformers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 43d2c88d3b9..1605467bc3d 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -143,6 +143,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.model: PreTrainedModel = AutoModel.from_config( self.config, attn_implementation="vllm", + torch_dtype=vllm_config.model_config.dtype, trust_remote_code=vllm_config.model_config.trust_remote_code, ) prefix = self.model.base_model_prefix From 97320060c4c9d30efadb50c85a4c557cf6cacbf5 Mon Sep 17 00:00:00 2001 From: Jewon Lee <105219284+je1lee@users.noreply.github.com> Date: Wed, 12 Feb 2025 01:20:37 +0900 Subject: [PATCH 0107/1240] [Misc] Fix typo at comments at metrics.py (#13024) Signed-off-by: Louis Ulmer --- vllm/engine/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index ce806b4a937..7c55d66e507 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -237,7 +237,7 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): documentation="Count of successfully processed requests.", labelnames=labelnames + [Metrics.labelname_finish_reason]) - # Speculatie decoding stats + # Speculative decoding stats self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls( name="vllm:spec_decode_draft_acceptance_rate", 
documentation="Speulative token acceptance rate.", From 7db43352add763d8d734ffad6df7301dbcdf0fa1 Mon Sep 17 00:00:00 2001 From: MoonRide303 <130458190+MoonRide303@users.noreply.github.com> Date: Tue, 11 Feb 2025 17:21:18 +0100 Subject: [PATCH 0108/1240] [Bugfix] Do not use resource module on Windows (#12858) (#13029) Signed-off-by: Louis Ulmer --- vllm/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index e1687527666..6a41afff8f0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -15,7 +15,6 @@ import multiprocessing import os import re -import resource import signal import socket import subprocess @@ -2070,6 +2069,11 @@ def memory_profiling( # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): + if sys.platform.startswith('win'): + logger.info("Windows detected, skipping ulimit adjustment.") + return + + import resource resource_type = resource.RLIMIT_NOFILE current_soft, current_hard = resource.getrlimit(resource_type) From 3232f847c091a63788d75646fb52b091f4d7bac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Tue, 11 Feb 2025 18:21:50 +0200 Subject: [PATCH 0109/1240] [BugFix] Pop instead of del CUDA_VISIBLE_DEVICES (#12962) Signed-off-by: Hollow Man Signed-off-by: Louis Ulmer --- examples/offline_inference/rlhf.py | 2 +- examples/offline_inference/rlhf_colocate.py | 2 +- tests/distributed/test_comm_ops.py | 10 +++++----- tests/distributed/test_custom_all_reduce.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 5000251c099..172d18cbce2 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -92,7 +92,7 @@ def __init__(self, *args, **kwargs): # a hack to make the script work. # stop ray from manipulating CUDA_VISIBLE_DEVICES # at the top-level - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) super().__init__(*args, **kwargs) diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py index b921bc71feb..15dc7edc18a 100644 --- a/examples/offline_inference/rlhf_colocate.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -59,7 +59,7 @@ def __init__(self, *args, bundle_indices: list, **kwargs): # a hack to make the script work. # stop ray from manipulating CUDA_VISIBLE_DEVICES # at the top-level - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) # every worker will use 0.4 GPU, so that we can schedule # 2 instances on the same GPUs. 
os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index bc916e8de07..7b0346b8ab5 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -22,7 +22,7 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int, # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -44,7 +44,7 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int, # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -72,7 +72,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, # it is important to delete the CUDA_VISIBLE_DEVICES environment variable # so that each worker can see all the GPUs # they will be able to set the device to the correct GPU - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -108,7 +108,7 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, distributed_init_port: str): - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -148,7 +148,7 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int, @ray.remote(num_gpus=1, max_calls=1) def send_recv_test_worker(tp_size: int, pp_size: int, rank: int, distributed_init_port: str): - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 46887bca42a..4928690bebb 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -24,7 +24,7 @@ @ray.remote(num_gpus=1, max_calls=1) def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, rank, @@ -80,7 +80,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): @ray.remote(num_gpus=1, max_calls=1) def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): - del os.environ["CUDA_VISIBLE_DEVICES"] + os.environ.pop("CUDA_VISIBLE_DEVICES", None) device = torch.device(f"cuda:{rank}") torch.cuda.set_device(device) init_test_distributed_environment(tp_size, pp_size, 
rank, From 7e6917e6a7af5ae7ee073859067eef3af34d6954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szymon=20O=C5=BC=C3=B3g?= <58388001+SzymonOzog@users.noreply.github.com> Date: Tue, 11 Feb 2025 17:38:48 +0100 Subject: [PATCH 0110/1240] Fix initializing GGUF weights for ColumnParallelLinear when using tensor parallel > 1 (#13023) Signed-off-by: Louis Ulmer --- vllm/model_executor/layers/linear.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index da8db08fe71..dad16112082 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -335,6 +335,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_rank = get_tensor_model_parallel_rank() output_dim = getattr(param, "output_dim", None) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + # Special case for GGUF is_gguf_weight = getattr(param, "is_gguf_weight", False) is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False) @@ -343,13 +349,12 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Materialize GGUF UninitializedParameter if is_gguf_weight and isinstance(param, UninitializedParameter): - param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) - - use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - is_sharded_weight = getattr(param, "is_sharded_weight", False) - # bitsandbytes loads the weights of the specific portion - # no need to narrow - is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + final_shape = list(loaded_weight.shape) + if output_dim is not None: + tp_size = get_tensor_model_parallel_world_size() + assert final_shape[output_dim] % tp_size == 0 + final_shape[output_dim] = final_shape[output_dim] // tp_size + param.materialize(final_shape, dtype=loaded_weight.dtype) param_data = param.data if output_dim is not None and not is_sharded_weight: From de483ed3a94a5d7e57f7fb5fa31aad286fd9aed6 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 12 Feb 2025 00:55:56 +0800 Subject: [PATCH 0111/1240] [CI/Build][Bugfix] Fix CPU backend default threads num (#13077) Signed-off-by: Louis Ulmer --- vllm/platforms/cpu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 179ee6a7d24..a9216c2322e 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -115,6 +115,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: # Environment variables for CPU executor # + # Set default threads num for OpenMP parallel + os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads()) + # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" From ee7c0cb79dc27f1324a97386b9e8205fb9957ea8 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 11 Feb 2025 18:02:46 +0000 Subject: [PATCH 0112/1240] [Doc] Improve OpenVINO installation doc (#13102) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Louis Ulmer --- .../installation/ai_accelerator/openvino.inc.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md index 112e8d4d9b2..4f25252d9da 100644 --- a/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/openvino.inc.md @@ -19,17 +19,19 @@ Currently, there are no pre-built OpenVINO wheels. ### Build wheel from source -First, install Python. For example, on Ubuntu 22.04, you can run: +First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run: ```console sudo apt-get update -y sudo apt-get install python3 +pip install --upgrade pip ``` -Second, install prerequisites vLLM OpenVINO backend installation: +Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation: ```console -pip install --upgrade pip +git clone https://github.com/vllm-project/vllm.git +cd vllm pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` From 9f554437be3ee15e338826f8d82e5e4266ad92bc Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Tue, 11 Feb 2025 13:17:44 -0500 Subject: [PATCH 0113/1240] [Bugfix] Guided decoding falls back to outlines when fails to import xgrammar (#12976) Signed-off-by: Yuan Tang Signed-off-by: Louis Ulmer --- vllm/model_executor/guided_decoding/__init__.py | 9 +++++++++ vllm/model_executor/guided_decoding/xgrammar_decoding.py | 2 ++ 2 files changed, 11 insertions(+) diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index cf96461a549..3eb7d186eb0 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -40,6 +40,8 @@ def maybe_backend_fallback( guided_params.backend = "outlines" if guided_params.backend == "xgrammar": + from vllm.model_executor.guided_decoding.xgrammar_decoding import ( + xgr_installed) # xgrammar only has x86 wheels for linux, fallback to outlines from vllm.platforms import current_platform if current_platform.get_cpu_architecture() is not CpuArchEnum.X86: @@ -77,6 +79,13 @@ def maybe_backend_fallback( "Falling back to use outlines instead.") guided_params.backend = "outlines" + # If the xgrammar module cannot be imported successfully, + # we should still allow users to use guided decoding with a fallback. + elif not xgr_installed: + logger.warning("xgrammar module cannot be imported successfully. 
" + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + if (guided_params.backend == "outlines" and guided_params.json_object is not None): # outlines doesn't support json_object, fallback to xgrammar diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index c01bd3af1d5..fc3a4cd4beb 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -14,7 +14,9 @@ try: import xgrammar as xgr from xgrammar.base import _core as xgr_core + xgr_installed = True except ImportError: + xgr_installed = False pass from vllm.model_executor.guided_decoding.utils import (convert_lark_to_gbnf, From 85ddfcfbe6d7b12ca0055617763fe891063f0760 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 11 Feb 2025 17:34:16 -0500 Subject: [PATCH 0114/1240] [Misc] Move pre-commit suggestion back to the end (#13114) Signed-off-by: Russell Bryant Signed-off-by: Louis Ulmer --- .pre-commit-config.yaml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 352eb2df01b..22b51afdc57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -116,13 +116,6 @@ repos: language: python types: [python] exclude: 'vllm/third_party/.*' - - id: suggestion - name: Suggestion - entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' - language: system - verbose: true - pass_filenames: false - exclude: 'vllm/third_party/.*' - id: check-filenames name: Check for spaces in all filenames entry: bash @@ -133,3 +126,12 @@ repos: always_run: true pass_filenames: false exclude: 'vllm/third_party/.*' + # Keep `suggestion` last + - id: suggestion + name: Suggestion + entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."' + language: system + verbose: true + pass_filenames: false + exclude: 'vllm/third_party/.*' + # Insert new entries above the `suggestion` entry From affb7da2267e19143fff33e53d0bcd00501d095c Mon Sep 17 00:00:00 2001 From: Keyun Tong Date: Tue, 11 Feb 2025 20:25:58 -0800 Subject: [PATCH 0115/1240] [RFC][vllm-API] Support tokenizer registry for customized tokenizer in vLLM (#12518) Signed-off-by: Keyun Tong Signed-off-by: Louis Ulmer --- benchmarks/benchmark_serving.py | 5 +- tests/tokenization/test_tokenizer_registry.py | 123 +++++++++++++++ vllm/config.py | 9 +- vllm/engine/arg_utils.py | 6 +- vllm/entrypoints/llm.py | 31 ++-- vllm/entrypoints/openai/serving_engine.py | 3 +- vllm/entrypoints/openai/serving_score.py | 2 +- vllm/logits_process.py | 2 +- vllm/transformers_utils/tokenizer.py | 18 ++- vllm/transformers_utils/tokenizer_base.py | 146 ++++++++++++++++++ vllm/transformers_utils/tokenizers/mistral.py | 39 +++-- 11 files changed, 343 insertions(+), 41 deletions(-) create mode 100644 tests/tokenization/test_tokenizer_registry.py create mode 100644 vllm/transformers_utils/tokenizer_base.py diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 0c892384236..90eb052399b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1275,11 +1275,12 @@ def main(args: argparse.Namespace): '--tokenizer-mode', type=str, default="auto", - choices=['auto', 'slow', 'mistral'], + choices=['auto', 'slow', 'mistral', 'custom'], help='The tokenizer mode.\n\n* "auto" will use the ' 'fast tokenizer if available.\n* "slow" will ' 'always use the slow tokenizer. 
\n* ' - '"mistral" will always use the `mistral_common` tokenizer.') + '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"custom" will use --tokenizer to select the preregistered tokenizer.') parser.add_argument("--served-model-name", type=str, diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py new file mode 100644 index 00000000000..793d38f9c36 --- /dev/null +++ b/tests/tokenization/test_tokenizer_registry.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.transformers_utils.tokenizer_base import (TokenizerBase, + TokenizerRegistry) + +if TYPE_CHECKING: + from vllm.entrypoints.chat_utils import ChatCompletionMessageParam + + +class TestTokenizer(TokenizerBase): + + @classmethod + def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer": + return TestTokenizer() + + @property + def all_special_tokens_extended(self) -> List[str]: + raise NotImplementedError() + + @property + def all_special_tokens(self) -> List[str]: + raise NotImplementedError() + + @property + def all_special_ids(self) -> List[int]: + raise NotImplementedError() + + @property + def bos_token_id(self) -> int: + return 0 + + @property + def eos_token_id(self) -> int: + return 1 + + @property + def sep_token(self) -> str: + raise NotImplementedError() + + @property + def pad_token(self) -> str: + raise NotImplementedError() + + @property + def is_fast(self) -> bool: + raise NotImplementedError() + + @property + def vocab_size(self) -> int: + raise NotImplementedError() + + @property + def max_token_id(self) -> int: + raise NotImplementedError() + + def __call__( + self, + text: Union[str, List[str], List[int]], + text_pair: Optional[str] = None, + add_special_tokens: bool = False, + truncation: bool = False, + max_length: Optional[int] = None, + ): + raise NotImplementedError() + + def get_vocab(self) -> Dict[str, int]: + raise NotImplementedError() + + def get_added_vocab(self) -> Dict[str, int]: + raise NotImplementedError() + + def encode_one( + self, + text: str, + truncation: bool = False, + max_length: Optional[int] = None, + ) -> List[int]: + raise NotImplementedError() + + def encode(self, + text: str, + add_special_tokens: Optional[bool] = None) -> List[int]: + raise NotImplementedError() + + def apply_chat_template(self, + messages: List["ChatCompletionMessageParam"], + tools: Optional[List[Dict[str, Any]]] = None, + **kwargs) -> List[int]: + raise NotImplementedError() + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + raise NotImplementedError() + + def decode(self, + ids: Union[List[int], int], + skip_special_tokens: bool = True) -> str: + raise NotImplementedError() + + def convert_ids_to_tokens( + self, + ids: List[int], + skip_special_tokens: bool = True, + ) -> List[str]: + raise NotImplementedError() + + +def test_customized_tokenizer(): + TokenizerRegistry.register("test_tokenizer", + "tests.tokenization.test_tokenizer_registry", + "TestTokenizer") + + tokenizer = TokenizerRegistry.get_tokenizer("test_tokenizer") + assert isinstance(tokenizer, TestTokenizer) + assert tokenizer.bos_token_id == 0 + assert tokenizer.eos_token_id == 1 + + tokenizer = get_tokenizer("test_tokenizer", tokenizer_mode="custom") + assert isinstance(tokenizer, TestTokenizer) + assert tokenizer.bos_token_id == 0 + assert tokenizer.eos_token_id == 1 diff --git a/vllm/config.py b/vllm/config.py 
index 1d8c42dd276..1740871e7c1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -102,8 +102,9 @@ class ModelConfig: it; otherwise, you must specify explicitly which task to use. tokenizer: Name or path of the huggingface tokenizer to use. tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if - available, "slow" will always use the slow tokenizer, and - "mistral" will always use the tokenizer from `mistral_common`. + available, "slow" will always use the slow tokenizer, + "mistral" will always use the tokenizer from `mistral_common`, and + "custom" will use --tokenizer to select the preregistered tokenizer. trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer. allowed_local_media_path: Allowing API requests to read local images or @@ -467,10 +468,10 @@ def _init_has_inner_state(self) -> bool: def _verify_tokenizer_mode(self) -> None: tokenizer_mode = self.tokenizer_mode.lower() - if tokenizer_mode not in ["auto", "slow", "mistral"]: + if tokenizer_mode not in ["auto", "slow", "mistral", "custom"]: raise ValueError( f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be " - "either 'auto', 'slow' or 'mistral'.") + "either 'auto', 'slow', 'mistral' or 'custom'.") self.tokenizer_mode = tokenizer_mode def _get_preferred_task( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4232ad9204f..83ee6b97f93 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -284,11 +284,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--tokenizer-mode', type=str, default=EngineArgs.tokenizer_mode, - choices=['auto', 'slow', 'mistral'], + choices=['auto', 'slow', 'mistral', 'custom'], help='The tokenizer mode.\n\n* "auto" will use the ' 'fast tokenizer if available.\n* "slow" will ' 'always use the slow tokenizer. \n* ' - '"mistral" will always use the `mistral_common` tokenizer.') + '"mistral" will always use the `mistral_common` tokenizer. \n* ' + '"custom" will use --tokenizer to select the ' + 'preregistered tokenizer.') parser.add_argument('--trust-remote-code', action='store_true', help='Trust remote code from huggingface.') diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index d071a0b3cfc..73593f0c6f0 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1051,9 +1051,9 @@ def _embedding_score( def _cross_encoding_score( self, - tokenizer: Union[AnyTokenizer], - text_1: List[Union[str, TextPrompt, TokensPrompt]], - text_2: List[Union[str, TextPrompt, TokensPrompt]], + tokenizer: AnyTokenizer, + text_1: List[str], + text_2: List[str], truncate_prompt_tokens: Optional[int] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, @@ -1176,29 +1176,36 @@ def ensure_str(prompt: SingletonPrompt): if isinstance(text_1, (str, dict)): # Convert a single prompt to a list. text_1 = [text_1] - text_1 = [ensure_str(t) for t in text_1] + input_text_1: List[str] = [ensure_str(t) for t in text_1] if isinstance(text_2, (str, dict)): # Convert a single prompt to a list. 
text_2 = [text_2] - text_2 = [ensure_str(t) for t in text_2] + input_text_2: List[str] = [ensure_str(t) for t in text_2] - if len(text_1) > 1 and len(text_1) != len(text_2): + if len(input_text_1) > 1 and len(input_text_1) != len(input_text_2): raise ValueError("Input lengths must be either 1:1, 1:N or N:N") - if len(text_1) == 0: + if len(input_text_1) == 0: raise ValueError("At least one text element must be given") - if len(text_2) == 0: + if len(input_text_2) == 0: raise ValueError("At least one text_pair element must be given") if self.llm_engine.model_config.is_cross_encoder: - return self._cross_encoding_score(tokenizer, text_1, text_2, + return self._cross_encoding_score(tokenizer, input_text_1, + input_text_2, truncate_prompt_tokens, use_tqdm, lora_request, prompt_adapter_request) else: - return self._embedding_score(tokenizer, text_1, text_2, - truncate_prompt_tokens, use_tqdm, - lora_request, prompt_adapter_request) + + return self._embedding_score( + tokenizer, + input_text_1, # type: ignore[arg-type] + input_text_2, # type: ignore[arg-type] + truncate_prompt_tokens, + use_tqdm, + lora_request, + prompt_adapter_request) def start_profile(self) -> None: self.llm_engine.start_profile() diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8d39fdcb748..9efb5e6fa39 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -400,8 +400,7 @@ async def _preprocess_chat( _chat_template_kwargs.update(chat_template_kwargs or {}) request_prompt: Union[str, List[int]] - is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) - if is_mistral_tokenizer: + if isinstance(tokenizer, MistralTokenizer): request_prompt = apply_mistral_chat_template( tokenizer, messages=messages, diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 832aa8516cc..c7597808f7f 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -121,7 +121,7 @@ async def create_score( tokenize_async = make_async(tokenizer.__call__, executor=self._tokenizer_executor) - prompt_inputs = await tokenize_async(text=q, + prompt_inputs = await tokenize_async(q, text_pair=t, **tokenization_kwargs) diff --git a/vllm/logits_process.py b/vllm/logits_process.py index d02072e8f81..a810be7bc7a 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -31,7 +31,7 @@ def get_bad_words_logits_processors( if isinstance(tokenizer, MistralTokenizer): # Mistral tokenizers should not add special tokens - prompt_token_ids = tokenizer.encode(prompt=prompt) + prompt_token_ids = tokenizer.encode(text=prompt) else: prompt_token_ids = tokenizer.encode(text=prompt, add_special_tokens=False) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 520870b563c..0c0f68ac123 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -14,6 +14,8 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.transformers_utils.tokenizer_base import (TokenizerBase, + TokenizerRegistry) from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import make_async @@ -21,7 +23,7 @@ logger = init_logger(__name__) AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast, - MistralTokenizer] + TokenizerBase] def decode_tokens( @@ 
-47,11 +49,7 @@ def encode_tokens( Backend-agnostic equivalent of HF's :code:`tokenizer.encode(text, add_special_tokens=...)`. """ - if isinstance(tokenizer, MistralTokenizer): - return tokenizer.tokenizer.encode(text, - bos=add_special_tokens, - eos=add_special_tokens) - elif add_special_tokens is not None: + if add_special_tokens is not None: return tokenizer.encode(text, add_special_tokens=add_special_tokens) return tokenizer.encode(text) @@ -183,9 +181,17 @@ def get_tokenizer( 'encoding and decoding.', FutureWarning, stacklevel=2) + + tokenizer: AnyTokenizer if tokenizer_mode == "mistral": tokenizer = MistralTokenizer.from_pretrained(str(tokenizer_name), revision=revision) + elif tokenizer_mode == "custom": + tokenizer = TokenizerRegistry.get_tokenizer(str(tokenizer_name), + *args, + revision=revision, + download_dir=download_dir, + **kwargs) else: try: tokenizer = AutoTokenizer.from_pretrained( diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py new file mode 100644 index 00000000000..bb5ddaf88b2 --- /dev/null +++ b/vllm/transformers_utils/tokenizer_base.py @@ -0,0 +1,146 @@ +# SPDX-License-Identifier: Apache-2.0 + +import importlib +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +if TYPE_CHECKING: + from vllm.entrypoints.chat_utils import ChatCompletionMessageParam + + +class TokenizerBase(ABC): + + @property + @abstractmethod + def all_special_tokens_extended(self) -> List[str]: + raise NotImplementedError() + + @property + @abstractmethod + def all_special_tokens(self) -> List[str]: + raise NotImplementedError() + + @property + @abstractmethod + def all_special_ids(self) -> List[int]: + raise NotImplementedError() + + @property + @abstractmethod + def bos_token_id(self) -> int: + raise NotImplementedError() + + @property + @abstractmethod + def eos_token_id(self) -> int: + raise NotImplementedError() + + @property + @abstractmethod + def sep_token(self) -> str: + raise NotImplementedError() + + @property + @abstractmethod + def pad_token(self) -> str: + raise NotImplementedError() + + @property + @abstractmethod + def is_fast(self) -> bool: + raise NotImplementedError() + + @property + @abstractmethod + def vocab_size(self) -> int: + raise NotImplementedError() + + @property + @abstractmethod + def max_token_id(self) -> int: + raise NotImplementedError() + + def __len__(self) -> int: + return self.vocab_size + + @abstractmethod + def __call__( + self, + text: Union[str, List[str], List[int]], + text_pair: Optional[str] = None, + add_special_tokens: bool = False, + truncation: bool = False, + max_length: Optional[int] = None, + ): + raise NotImplementedError() + + @abstractmethod + def get_vocab(self) -> Dict[str, int]: + raise NotImplementedError() + + @abstractmethod + def get_added_vocab(self) -> Dict[str, int]: + raise NotImplementedError() + + @abstractmethod + def encode_one( + self, + text: str, + truncation: bool = False, + max_length: Optional[int] = None, + ) -> List[int]: + raise NotImplementedError() + + @abstractmethod + def encode(self, + text: str, + add_special_tokens: Optional[bool] = None) -> List[int]: + raise NotImplementedError() + + @abstractmethod + def apply_chat_template(self, + messages: List["ChatCompletionMessageParam"], + tools: Optional[List[Dict[str, Any]]] = None, + **kwargs) -> List[int]: + raise NotImplementedError() + + @abstractmethod + def convert_tokens_to_string(self, tokens: List[str]) -> str: + raise NotImplementedError() + + 
@abstractmethod + def decode(self, + ids: Union[List[int], int], + skip_special_tokens: bool = True) -> str: + raise NotImplementedError() + + @abstractmethod + def convert_ids_to_tokens( + self, + ids: List[int], + skip_special_tokens: bool = True, + ) -> List[str]: + raise NotImplementedError() + + +class TokenizerRegistry: + # Tokenizer name -> (tokenizer module, tokenizer class) + REGISTRY: Dict[str, Tuple[str, str]] = {} + + @staticmethod + def register(name: str, module: str, class_name: str) -> None: + TokenizerRegistry.REGISTRY[name] = (module, class_name) + + @staticmethod + def get_tokenizer( + tokenizer_name: str, + *args, + **kwargs, + ) -> TokenizerBase: + tokenizer_cls = TokenizerRegistry.REGISTRY.get(tokenizer_name) + if tokenizer_cls is None: + raise ValueError(f"Tokenizer {tokenizer_name} not found.") + + tokenizer_module = importlib.import_module(tokenizer_cls[0]) + class_ = getattr(tokenizer_module, tokenizer_cls[1]) + return class_.from_pretrained(*args, **kwargs) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index f08923e7401..59131a9d7bf 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -10,6 +10,7 @@ from huggingface_hub import HfApi, hf_hub_download from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer_base import TokenizerBase from vllm.utils import is_list_of if TYPE_CHECKING: @@ -140,7 +141,7 @@ def make_mistral_chat_completion_request( tools=tools) # type: ignore[type-var] -class MistralTokenizer: +class MistralTokenizer(TokenizerBase): def __init__(self, tokenizer: "PublicMistralTokenizer") -> None: self.mistral = tokenizer @@ -251,6 +252,14 @@ def bos_token_id(self) -> int: def eos_token_id(self) -> int: return self.tokenizer.eos_id + @property + def sep_token(self) -> str: + raise NotImplementedError() + + @property + def pad_token(self) -> str: + raise NotImplementedError() + @property def is_fast(self) -> bool: return True @@ -268,25 +277,26 @@ def __len__(self) -> int: def __call__( self, - prompt: Union[str, List[str], List[int]], + text: Union[str, List[str], List[int]], + text_pair: Optional[str] = None, add_special_tokens: bool = False, truncation: bool = False, max_length: Optional[int] = None, ): input_ids: Union[List[int], List[List[int]]] # For List[str], original prompt text - if is_list_of(prompt, str): + if is_list_of(text, str): input_ids_: List[List[int]] = [] - for p in prompt: + for p in text: each_input_ids = self.encode_one(p, truncation, max_length) input_ids_.append(each_input_ids) input_ids = input_ids_ # For List[int], apply chat template output, already tokens. 
- elif is_list_of(prompt, int): - input_ids = prompt + elif is_list_of(text, int): + input_ids = text # For str, single prompt text else: - input_ids = self.encode_one(prompt, truncation, max_length) + input_ids = self.encode_one(text, truncation, max_length) return Encoding(input_ids=input_ids) def get_vocab(self) -> Dict[str, int]: @@ -300,22 +310,29 @@ def get_added_vocab(self) -> Dict[str, int]: def encode_one( self, - prompt: str, + text: str, truncation: bool = False, max_length: Optional[int] = None, ) -> List[int]: # Mistral Tokenizers should not add special tokens - input_ids = self.encode(prompt) + input_ids = self.encode(text) if truncation: input_ids = input_ids[:max_length] return input_ids - def encode(self, prompt: str) -> List[int]: + def encode(self, + text: str, + add_special_tokens: Optional[bool] = None) -> List[int]: # `encode` should only be used for prompt completion # it should never be used for chat_completion. # For chat completion use `apply_chat_template` - return self.tokenizer.encode(prompt, bos=True, eos=False) + if add_special_tokens is not None: + return self.tokenizer.encode(text, + bos=add_special_tokens, + eos=add_special_tokens) + else: + return self.tokenizer.encode(text, bos=True, eos=False) def apply_chat_template(self, messages: List["ChatCompletionMessageParam"], From 267d03a79d2411f5af938d305b4e14e82393efdd Mon Sep 17 00:00:00 2001 From: Christian Pinto Date: Wed, 12 Feb 2025 04:34:30 +0000 Subject: [PATCH 0116/1240] [Model] IBM/NASA Prithvi Geospatial model (#12830) Signed-off-by: Louis Ulmer --- .../prithvi_geospatial_mae.py | 530 ++++++++++++++++++ tests/models/registry.py | 4 + vllm/attention/backends/placeholder_attn.py | 11 +- vllm/inputs/preprocess.py | 22 +- .../models/prithvi_geospatial_mae.py | 238 ++++++++ vllm/model_executor/models/registry.py | 4 + vllm/worker/pooling_model_runner.py | 11 +- 7 files changed, 811 insertions(+), 9 deletions(-) create mode 100644 examples/offline_inference/prithvi_geospatial_mae.py create mode 100644 vllm/model_executor/models/prithvi_geospatial_mae.py diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py new file mode 100644 index 00000000000..298f0801900 --- /dev/null +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -0,0 +1,530 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This is a demo script showing how to use the +PrithviGeospatialMAE model with vLLM +This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa + +Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa + +The requirements for running this script are: +- Installing [terratorch, albumentations, rasterio] in your python environment +- downloading the model weights in a 'model' folder local to the script + (temporary measure until the proper config.json file is uploaded to HF) +- download an input example image (India_900498_S2Hand.tif) and place it in + the same folder with the script (or specify with the --data_file argument) + +Run the example: +python prithvi_geospatial_mae.py + +""" # noqa: E501 +import argparse +import datetime +import os +import re +from typing import List, Union + +import albumentations +import numpy as np +import rasterio +import torch +from einops import rearrange +from terratorch.datamodules import Sen1Floods11NonGeoDataModule + +from vllm import LLM + 
+NO_DATA = -9999 +NO_DATA_FLOAT = 0.0001 +OFFSET = 0 +PERCENTILE = 99 + +model_config = """{ + "architectures": ["PrithviGeoSpatialMAE"], + "num_classes": 0, + "pretrained_cfg": { + "task_args": { + "task": "SemanticSegmentationTask", + "model_factory": "EncoderDecoderFactory", + "loss": "ce", + "ignore_index": -1, + "lr": 0.001, + "freeze_backbone": false, + "freeze_decoder": false, + "plot_on_val": 10, + "optimizer": "AdamW", + "scheduler": "CosineAnnealingLR" + }, + "model_args": { + "backbone_pretrained": false, + "backbone": "prithvi_eo_v2_300_tl", + "decoder": "UperNetDecoder", + "decoder_channels": 256, + "decoder_scale_modules": true, + "num_classes": 2, + "rescale": true, + "backbone_bands": [ + "BLUE", + "GREEN", + "RED", + "NIR_NARROW", + "SWIR_1", + "SWIR_2" + ], + "head_dropout": 0.1, + "necks": [ + { + "name": "SelectIndices", + "indices": [ + 5, + 11, + 17, + 23 + ] + }, + { + "name": "ReshapeTokensToImage" + } + ] + }, + "optimizer_params" : { + "lr": 5.0e-05, + "betas": [0.9, 0.999], + "eps": [1.0e-08], + "weight_decay": 0.05, + "amsgrad": false, + "maximize": false, + "capturable": false, + "differentiable": false + }, + "scheduler_params" : { + "T_max": 50, + "eta_min": 0, + "last_epoch": -1, + "verbose": "deprecated" + } + }, + + + "torch_dtype": "float32" +} +""" + +# Temporarily creating the "config.json" for the model. +# This is going to disappear once the correct config.json is available on HF +with open(os.path.join(os.path.dirname(__file__), "./model/config.json"), + 'w') as config_file: + config_file.write(model_config) + +datamodule_config = { + 'bands': ['BLUE', 'GREEN', 'RED', 'NIR_NARROW', 'SWIR_1', 'SWIR_2'], + 'batch_size': + 16, + 'constant_scale': + 0.0001, + 'data_root': + '/dccstor/geofm-finetuning/datasets/sen1floods11', + 'drop_last': + True, + 'no_data_replace': + 0.0, + 'no_label_replace': + -1, + 'num_workers': + 8, + 'test_transform': [ + albumentations.Resize(always_apply=False, + height=448, + interpolation=1, + p=1, + width=448), + albumentations.pytorch.ToTensorV2(transpose_mask=False, + always_apply=True, + p=1.0) + ], +} + + +class PrithviMAE: + + def __init__(self): + print("Initializing PrithviMAE model") + self.model = LLM(model=os.path.join(os.path.dirname(__file__), + "./model"), + skip_tokenizer_init=True, + dtype="float32") + + def run(self, input_data, location_coords): + print("################ Running inference on vLLM ##############") + # merge the inputs into one data structure + mm_data = { + "pixel_values": + torch.empty(0) if input_data is None else input_data, + "location_coords": + torch.empty(0) if location_coords is None else location_coords + } + + prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} + + outputs = self.model.encode(prompt, use_tqdm=False) + print( + "################ Inference done (it took seconds) ##############" + ) + + return outputs[0].outputs.data + + +def generate_datamodule(): + datamodule = Sen1Floods11NonGeoDataModule( + data_root=datamodule_config['data_root'], + batch_size=datamodule_config["batch_size"], + num_workers=datamodule_config["num_workers"], + bands=datamodule_config["bands"], + drop_last=datamodule_config["drop_last"], + test_transform=datamodule_config["test_transform" + ""]) + + return datamodule + + +def process_channel_group(orig_img, channels): + """ + Args: + orig_img: torch.Tensor representing original image (reference) + with shape = (bands, H, W). + channels: list of indices representing RGB channels. 
+ + Returns: + torch.Tensor with shape (num_channels, height, width) for original image + """ + + orig_img = orig_img[channels, ...] + valid_mask = torch.ones_like(orig_img, dtype=torch.bool) + valid_mask[orig_img == NO_DATA_FLOAT] = False + + # Rescale (enhancing contrast) + max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE)) + min_value = OFFSET + + orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, + 1) + + # No data as zeros + orig_img[~valid_mask] = 0 + + return orig_img + + +def read_geotiff(file_path: str): + """Read all bands from *file_path* and return image + meta info. + + Args: + file_path: path to image file. + + Returns: + np.ndarray with shape (bands, height, width) + meta info dict + """ + + with rasterio.open(file_path) as src: + img = src.read() + meta = src.meta + try: + coords = src.lnglat() + except Exception: + # Cannot read coords + coords = None + + return img, meta, coords + + +def save_geotiff(image, output_path: str, meta: dict): + """Save multi-band image in Geotiff file. + + Args: + image: np.ndarray with shape (bands, height, width) + output_path: path where to save the image + meta: dict with meta info. + """ + + with rasterio.open(output_path, "w", **meta) as dest: + for i in range(image.shape[0]): + dest.write(image[i, :, :], i + 1) + + return + + +def _convert_np_uint8(float_image: torch.Tensor): + image = float_image.numpy() * 255.0 + image = image.astype(dtype=np.uint8) + + return image + + +def load_example( + file_paths: List[str], + mean: List[float] = None, + std: List[float] = None, + indices: Union[list[int], None] = None, +): + """Build an input example by loading images in *file_paths*. + + Args: + file_paths: list of file paths . + mean: list containing mean values for each band in the images + in *file_paths*. + std: list containing std values for each band in the images + in *file_paths*. 
+ + Returns: + np.array containing created example + list of meta info for each image in *file_paths* + """ + + imgs = [] + metas = [] + temporal_coords = [] + location_coords = [] + + for file in file_paths: + img, meta, coords = read_geotiff(file) + + # Rescaling (don't normalize on nodata) + img = np.moveaxis(img, 0, -1) # channels last for rescaling + if indices is not None: + img = img[..., indices] + if mean is not None and std is not None: + img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std) + + imgs.append(img) + metas.append(meta) + if coords is not None: + location_coords.append(coords) + + try: + match = re.search(r'(\d{7,8}T\d{6})', file) + if match: + year = int(match.group(1)[:4]) + julian_day = match.group(1).split('T')[0][4:] + if len(julian_day) == 3: + julian_day = int(julian_day) + else: + julian_day = datetime.datetime.strptime( + julian_day, '%m%d').timetuple().tm_yday + temporal_coords.append([year, julian_day]) + except Exception as e: + print(f'Could not extract timestamp for {file} ({e})') + + imgs = np.stack(imgs, axis=0) # num_frames, H, W, C + imgs = np.moveaxis(imgs, -1, 0).astype("float32") + imgs = np.expand_dims(imgs, axis=0) # add batch di + + return imgs, temporal_coords, location_coords, metas + + +def run_model(input_data, + temporal_coords, + location_coords, + model, + datamodule, + img_size, + lightning_model=None): + # Reflect pad if not divisible by img_size + original_h, original_w = input_data.shape[-2:] + pad_h = (img_size - (original_h % img_size)) % img_size + pad_w = (img_size - (original_w % img_size)) % img_size + input_data = np.pad(input_data, + ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), + mode="reflect") + + # Build sliding window + batch_size = 1 + batch = torch.tensor(input_data, device="cpu") + windows = (batch.unfold(3, img_size, + img_size).unfold(4, img_size, img_size)) + h1, w1 = windows.shape[3:5] + windows = rearrange(windows, + "b c t h1 w1 h w -> (b h1 w1) c t h w", + h=img_size, + w=img_size) + + # Split into batches if number of windows > batch_size + num_batches = windows.shape[0] // batch_size if windows.shape[ + 0] > batch_size else 1 + windows = torch.tensor_split(windows, num_batches, dim=0) + + if torch.cuda.is_available(): + device = torch.device('cuda') + else: + device = torch.device('cpu') + + if temporal_coords: + temporal_coords = torch.tensor(temporal_coords, + device=device).unsqueeze(0) + else: + temporal_coords = None + if location_coords: + location_coords = torch.tensor(location_coords[0], + device=device).unsqueeze(0) + else: + location_coords = None + + # Run model + pred_imgs = [] + for x in windows: + # Apply standardization + x = datamodule.test_transform( + image=x.squeeze().numpy().transpose(1, 2, 0)) + x = datamodule.aug(x)['image'] + + with torch.no_grad(): + x = x.to(device) + pred = model.run(x, location_coords=location_coords) + if lightning_model: + pred_lightning = lightning_model( + x, + temporal_coords=temporal_coords, + location_coords=location_coords) + pred_lightning = pred_lightning.output.detach().cpu() + if not torch.equal(pred, pred_lightning): + print("Inference output is not equal") + y_hat = pred.argmax(dim=1) + + y_hat = torch.nn.functional.interpolate(y_hat.unsqueeze(1).float(), + size=img_size, + mode="nearest") + + pred_imgs.append(y_hat) + + pred_imgs = torch.concat(pred_imgs, dim=0) + + # Build images from patches + pred_imgs = rearrange( + pred_imgs, + "(b h1 w1) c h w -> b c (h1 h) (w1 w)", + h=img_size, + w=img_size, + b=1, + c=1, + h1=h1, + w1=w1, + ) 
+ + # Cut padded area back to original size + pred_imgs = pred_imgs[..., :original_h, :original_w] + + # Squeeze (batch size 1) + pred_imgs = pred_imgs[0] + + return pred_imgs + + +def main( + data_file: str, + output_dir: str, + rgb_outputs: bool, + input_indices: list[int] = None, +): + os.makedirs(output_dir, exist_ok=True) + + # Load model --------------------------------------------------------------- + + model_obj = PrithviMAE() + datamodule = generate_datamodule() + img_size = 256 # Size of Sen1Floods11 + + # Loading data ------------------------------------------------------------- + + input_data, temporal_coords, location_coords, meta_data = load_example( + file_paths=[data_file], + indices=input_indices, + ) + + meta_data = meta_data[0] # only one image + + if input_data.mean() > 1: + input_data = input_data / 10000 # Convert to range 0-1 + + # Running model ------------------------------------------------------------ + + channels = [ + datamodule_config['bands'].index(b) for b in ["RED", "GREEN", "BLUE"] + ] # BGR -> RGB + + pred = run_model(input_data, temporal_coords, location_coords, model_obj, + datamodule, img_size) + + # Save pred + meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0) + pred_file = os.path.join( + output_dir, + f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + save_geotiff(_convert_np_uint8(pred), pred_file, meta_data) + + # Save image + pred + meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0) + + if input_data.mean() < 1: + input_data = input_data * 10000 # Scale to 0-10000 + + rgb_orig = process_channel_group( + orig_img=torch.Tensor(input_data[0, :, 0, ...]), + channels=channels, + ) + + pred[pred == 0.] = np.nan + img_pred = rgb_orig * 0.7 + pred * 0.3 + img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()] + + img_pred_file = os.path.join( + output_dir, + f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + save_geotiff( + image=_convert_np_uint8(img_pred), + output_path=img_pred_file, + meta=meta_data, + ) + + # Save image rgb + if rgb_outputs: + rgb_file = os.path.join( + output_dir, "original_rgb_" + f"{os.path.splitext(os.path.basename(data_file))[0]}.tiff") + save_geotiff( + image=_convert_np_uint8(rgb_orig), + output_path=rgb_file, + meta=meta_data, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("MAE run inference", add_help=False) + + parser.add_argument( + "--data_file", + type=str, + default="./India_900498_S2Hand.tif", + help="Path to the file.", + ) + parser.add_argument( + "--output_dir", + type=str, + default="output", + help="Path to the directory where to save outputs.", + ) + parser.add_argument( + "--input_indices", + default=[1, 2, 3, 8, 11, 12], + type=int, + nargs="+", + help= + "0-based indices of the six Prithvi channels to be selected from the " + "input. By default selects [1,2,3,8,11,12] for S2L1C data.", + ) + parser.add_argument( + "--rgb_outputs", + action="store_true", + help="If present, output files will only contain RGB channels. 
" + "Otherwise, all bands will be saved.", + ) + args = parser.parse_args() + + main(**vars(args)) diff --git a/tests/models/registry.py b/tests/models/registry.py index 66b7d3c2e77..7b1db55494f 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -214,6 +214,10 @@ def check_available_online( "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", trust_remote_code=True), "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 + # The model on Huggingface is currently being updated, + # hence I temporarily mark it as not available online + "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 + is_available_online=False), } _CROSS_ENCODER_EXAMPLE_MODELS = { diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index f363ba0c1e3..f1def25c89c 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -320,9 +320,14 @@ def build(self, seq_lens: List[int], query_lens: List[int], -1 if cuda graph is not used. batch_size: The maybe padded batch size. """ - for inter_data in self.input_builder.inter_data_list: - self._add_seq_group(inter_data, - self.input_builder.chunked_prefill_enabled) + + # Some input builders such as ModelInputForCPUBuilder do not have the + # "inter_data_list" attribute. + # Let's check inter_data_list exists before we reference it. + if hasattr(self.input_builder, "inter_data_list"): + for inter_data in self.input_builder.inter_data_list: + self._add_seq_group(inter_data, + self.input_builder.chunked_prefill_enabled) device = self.runner.device use_captured_graph = cuda_graph_pad_size != -1 diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 53f89996f0f..656f2f2b766 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -254,8 +254,14 @@ def _process_multimodal( Apply the model's multi-modal processor to a multi-modal prompt, returning the corresponding token IDs and metadata. """ - tokenizer_group = self.get_tokenizer_group() - tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) + # At the moment on model (PrithviGeoSpatialMAE) requires to be + # initialized without a tokenizer while using also multi-modal + # input. + if not self.tokenizer: + tokenizer = None + else: + tokenizer_group = self.get_tokenizer_group() + tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) mm_processor = self.mm_registry.create_processor( self.model_config, tokenizer) @@ -273,9 +279,15 @@ async def _process_multimodal_async( lora_request: Optional[LoRARequest], ) -> MultiModalInputs: """Async version of :meth:`_process_multimodal`.""" - tokenizer_group = self.get_tokenizer_group() - tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request - ) + # At the moment on model (PrithviGeoSpatialMAE) requires to be + # initialized without a tokenizer while using also multi-modal + # input. 
+ if not self.tokenizer: + tokenizer = None + else: + tokenizer_group = self.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async( + lora_request) mm_processor = self.mm_registry.create_processor( self.model_config, tokenizer) diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py new file mode 100644 index 00000000000..9383cbae11b --- /dev/null +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -0,0 +1,238 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Copyright 2025 The vLLM team. +# Copyright 2025 IBM. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only IBM/NASA Prithvi Geospatial model.""" +from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union + +import torch +import torch.nn as nn +from transformers import BatchFeature + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import (IsAttentionFree, + SupportsMultiModal) +from vllm.model_executor.models.utils import AutoWeightsLoader +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputs, MultiModalKwargs) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import (IntermediateTensors, PoolerOutput, + PoolingSequenceGroupOutput) + + +class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + pass + + +class PrithviGeoSpatialMAEInputBuilder( + BaseDummyInputsBuilder[PrithviGeoSpatialMAEProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + return ProcessorInputs( + prompt_text="", + # This model input is fixed and is in the form of a torch Tensor. + # The size of pixel_values might change in the cases where we resize + # the input but never exceeds the dimensions below. 
+            mm_data={
+                "pixel_values": torch.full((1, 6, 512, 512), 1.0),
+                "location_coords": torch.full((1, 2), 1.0)
+            })
+
+
+class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+            location_coords=MultiModalFieldConfig.batched("image"),
+        )
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        pass
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        pass
+
+    def apply(
+        self,
+        prompt: Union[str, list[int]],
+        mm_data: MultiModalDataDict,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> MultiModalInputs:
+        mm_kwargs = {}
+
+        for k, v in mm_data.items():
+            mm_kwargs[k] = v
+
+        return MultiModalInputs(
+            type="multimodal",
+            prompt=prompt,
+            prompt_token_ids=[1],
+            mm_kwargs=MultiModalKwargs(mm_kwargs),
+            mm_placeholders={},
+        )
+
+
+@MULTIMODAL_REGISTRY.register_processor(
+    PrithviGeoSpatialMAEMultiModalProcessor,
+    info=PrithviGeoSpatialMAEProcessingInfo,
+    dummy_inputs=PrithviGeoSpatialMAEInputBuilder)
+class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal):
+    """ Prithvi Masked Autoencoder"""
+
+    def _instantiate_model(self, config: dict) -> nn.Module | None:
+
+        # We might be able/need to support different tasks with this same model
+        if config["task_args"]["task"] == "SemanticSegmentationTask":
+            from terratorch.cli_tools import SemanticSegmentationTask
+            task = SemanticSegmentationTask(
+                config["model_args"],
+                config["task_args"]["model_factory"],
+                loss=config["task_args"]["loss"],
+                lr=config["task_args"]["lr"],
+                ignore_index=config["task_args"]["ignore_index"],
+                optimizer=config["task_args"]["optimizer"],
+                optimizer_hparams=config["optimizer_params"],
+                scheduler=config["task_args"]["scheduler"],
+                scheduler_hparams=config["scheduler_params"],
+                plot_on_val=config["task_args"]["plot_on_val"],
+                freeze_decoder=config["task_args"]["freeze_decoder"],
+                freeze_backbone=config["task_args"]["freeze_backbone"])
+
+            return task.model
+        else:
+            return None
+
+    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+
+        # the actual model is dynamically instantiated using terratorch
+        # allowing us to perform changes to the model architecture
+        # at startup time (e.g., change the model decoder class.)
+        self.model = self._instantiate_model(
+            vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"])
+        if self.model is None:
+            raise ValueError(
+                "Unsupported task. "
+                "Only SemanticSegmentationTask is supported for now "
+                "by PrithviGeospatialMAE.")
+
+    def _parse_and_validate_multimodal_data(
+            self, **kwargs) -> Tuple[torch.Tensor, torch.Tensor | None]:
+
+        pixel_values = kwargs.pop("pixel_values", None)
+        if not isinstance(pixel_values, torch.Tensor):
+            raise ValueError(f"Incorrect type of pixel_values. "
+                             f"Got type: {type(pixel_values)}")
+        pixel_values = torch.unbind(pixel_values, dim=0)[0]
+
+        location_coords = kwargs.pop("location_coords", None)
+        if not isinstance(location_coords, torch.Tensor):
+            raise ValueError(f"Incorrect type of location_coords. "
+                             f"Got type: {type(location_coords)}")
+        location_coords = torch.unbind(location_coords, dim=0)[0]
+        if location_coords.shape == torch.Size([0]):
+            location_coords = None
+
+        return pixel_values, location_coords
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ):
+
+        pixel_values, location_coords = (
+            self._parse_and_validate_multimodal_data(**kwargs))
+        model_output = self.model(pixel_values,
+                                  location_coords=location_coords)
+
+        return model_output.output
+
+    def pooler(
+        self,
+        hidden_states: torch.Tensor,
+        pooling_metadata: PoolingMetadata,
+    ) -> Optional[PoolerOutput]:
+        return PoolerOutput([PoolingSequenceGroupOutput(hidden_states)])
+
+    def load_weights(self, weights: Iterable[Tuple[str,
+                                                   torch.Tensor]]) -> Set[str]:
+        params_list = []
+        model_buffers = dict(self.named_buffers())
+        loaded_buffers = []
+        for key, value in weights:
+            if key == "state_dict":
+                weights_to_parse = value
+                for name, weight in weights_to_parse.items():
+                    if "pos_embed" in name:
+                        continue
+
+                    if "_timm_module." in name:
+                        name = name.replace("_timm_module.", "")
+
+                    # this model requires a couple of buffers to be loaded
+                    # that are not loadable with the AutoWeightsLoader
+                    if name in model_buffers:
+                        if "_timm_module." in name:
+                            name = name.replace("_timm_module.", "")
+                        buffer = model_buffers[name]
+                        weight_loader = getattr(buffer, "weight_loader",
+                                                default_weight_loader)
+                        weight_loader(buffer, weight)
+                        loaded_buffers.append(name)
+                    else:
+                        params_list.append((name, weight))
+                break
+
+        # Load the remaining model parameters
+        loader = AutoWeightsLoader(self)
+        autoloaded_weights = loader.load_weights(params_list)
+
+        return autoloaded_weights.union(set(loaded_buffers))
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index c2d0fae7056..ebf6a88f21b 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -137,6 +137,10 @@
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
     # [Auto-converted (see adapters.py)]
     "Qwen2ForSequenceClassification": ("qwen2", "Qwen2ForCausalLM"),
+    # Technically PrithviGeoSpatialMAE is a model that works on images, both in
+    # input and output. I am adding it here because it piggy-backs on embedding
+    # models for the time being.
+    "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"),
 }
 
 _CROSS_ENCODER_MODELS = {
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index f43085b0e96..4cbe5db4453 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -74,7 +74,16 @@ def execute_model(
         prefill_meta = model_input.attn_metadata.prefill_metadata
         decode_meta = model_input.attn_metadata.decode_metadata
         virtual_engine = model_input.virtual_engine
-        if prefill_meta is None and decode_meta.use_cuda_graph:
+        # Pooling models are also (ab-)used to integrate non-text models that
+        # are not autoregressive (PrithviGeospatialMAE).
+        # These models might not use attention and do not really have a prefill
+        # and decode phase. The model input is processed in one shot and both
+        # decode_metadata and prefill_metadata would be None for such models.
+        # See the PlaceholderAttentionMetadata class.
+ # TODO: Figure out if cuda_graph is of any use for these models and + # explore how to leverage it. + if (prefill_meta is None and decode_meta is not None + and decode_meta.use_cuda_graph): assert model_input.input_tokens is not None graph_batch_size = model_input.input_tokens.shape[0] model_executable = self.graph_runners[virtual_engine][ From c0396ab85c8d66890c4c8d7b6c626a90fc3b83e3 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 11 Feb 2025 20:38:10 -0800 Subject: [PATCH 0117/1240] [ci] Add more source file dependencies for some tests (#13123) Signed-off-by: <> Co-authored-by: EC2 Default User Signed-off-by: Louis Ulmer --- .buildkite/test-pipeline.yaml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 948eab97ffa..e26b1bf3818 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -107,6 +107,10 @@ steps: mirror_hardwares: [amd] source_file_dependencies: - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + - tests/entrypoints/offline_mode commands: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process @@ -124,9 +128,10 @@ steps: source_file_dependencies: - vllm/distributed/ - vllm/core/ - - tests/distributed + - tests/distributed/test_utils + - tests/distributed/test_pynccl - tests/spec_decode/e2e/test_integration_dist_tp4 - - tests/compile + - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py commands: @@ -174,6 +179,9 @@ steps: - vllm/ - tests/engine - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger commands: - pytest -v -s engine test_sequence.py test_config.py test_logger.py # OOM in the CI unless we run this separately From a3117c39668e21962d48be668d4dc0459a1e1f0c Mon Sep 17 00:00:00 2001 From: Lingfan Yu Date: Tue, 11 Feb 2025 21:12:37 -0800 Subject: [PATCH 0118/1240] [Neuron][Kernel] Support Longer Sequences in NKI-based Flash PagedAttention and Improve Efficiency (#12921) Signed-off-by: Lingfan Yu Signed-off-by: Louis Ulmer --- tests/neuron/test_prefix_prefill.py | 118 ++++++++------- vllm/attention/ops/nki_flash_attn.py | 216 +++++++++++---------------- 2 files changed, 154 insertions(+), 180 deletions(-) diff --git a/tests/neuron/test_prefix_prefill.py b/tests/neuron/test_prefix_prefill.py index dfbcfc15e23..04d1bd3f0eb 100644 --- a/tests/neuron/test_prefix_prefill.py +++ b/tests/neuron/test_prefix_prefill.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import random from typing import Optional import pytest @@ -171,12 +170,22 @@ def ref_context_attention( return output +@pytest.mark.parametrize( + "block_size, large_tile_size", + [ + (32, 2048), # 64 blocks + (32, 4096), # 128 blocks + (32, 8192), # 256 blocks + (64, 8192), # 128 blocks + ], +) @pytest.mark.parametrize( "num_heads,num_queries_per_kv,head_size,mixed_precision", [ (4, 2, 8, False), (4, 2, 8, True), (32, 8, 64, True), + (16, 2, 128, True), ], ) @torch.inference_mode() @@ -184,6 +193,8 @@ def test_contexted_kv_attention( num_heads: int, num_queries_per_kv: int, head_size: int, + block_size: int, + large_tile_size, 
mixed_precision: bool, ) -> None: import os @@ -192,40 +203,46 @@ def test_contexted_kv_attention( from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc + assert large_tile_size % block_size == 0 + device = xm.xla_device() - os.environ["NEURON_CC_FLAGS"] = ( - " --model-type=transformer -O1 " - " --internal-hlo2tensorizer-options='--verify-hlo' ") + compiler_flags = [ + "--model-type=transformer -O1", + "--internal-hlo2tensorizer-options='--verify-hlo'", + "--retry_failed_compilation", + ] + compiler_flags_str = " ".join(compiler_flags) + os.environ["NEURON_CC_FLAGS"] = compiler_flags_str - random.seed(0) torch.manual_seed(0) torch.set_printoptions(sci_mode=False) - min_ctx_len = 2 - max_ctx_len = 64 - min_query_len = 2 - max_query_len = 64 - prefill_batch_size = 2 - decode_batch_size = 6 + min_ctx_len = 32 + max_ctx_len = 1024 + min_query_len = 16 + max_query_len = 512 + prefill_batch_size = 4 + decode_batch_size = 12 batch_size = prefill_batch_size + decode_batch_size - block_size = 32 max_model_len = (max_query_len + max_ctx_len) * 4 max_block_per_request = max_model_len // block_size dtype = torch.float32 cache_size = (batch_size * max_block_per_request) + 2 - ctx_lens = [ - random.randint(min_ctx_len, max_ctx_len) - for _ in range(prefill_batch_size) - ] + [ - random.randint(min_ctx_len, max_ctx_len) - for _ in range(decode_batch_size) - ] - query_lens = [ - random.randint(min_query_len, max_query_len) - for _ in range(prefill_batch_size) - ] + [1 for _ in range(decode_batch_size)] + prefill_ctx_lens = torch.randint(min_ctx_len, + max_ctx_len + 1, (prefill_batch_size, ), + dtype=torch.long).tolist() + decode_ctx_lens = torch.randint(min_ctx_len, + max_ctx_len + 1, (decode_batch_size, ), + dtype=torch.long).tolist() + ctx_lens = prefill_ctx_lens + decode_ctx_lens + query_lens = torch.randint( + min_query_len, + max_query_len + 1, + (prefill_batch_size, ), + dtype=torch.long, + ).tolist() + [1 for _ in range(decode_batch_size)] seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] num_kv_heads = num_heads // num_queries_per_kv @@ -254,7 +271,6 @@ def test_contexted_kv_attention( values = values[torch.randperm(cache_size)] block_table = values[:batch_size * max_block_per_request].view( batch_size, max_block_per_request) - torch.tensor(seq_lens, dtype=torch.long) b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1], dtype=torch.long), @@ -311,9 +327,7 @@ def test_contexted_kv_attention( # build neuron program return_debug_tensors = False B_P_SIZE = 128 - LARGE_TILE_SZ = 2048 - max_num_queries = ( - (sum(query_lens) + block_size - 1) // block_size) * block_size + LARGE_TILE_SZ = large_tile_size def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, num_blocks): @@ -332,26 +346,28 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, 0, ) - def shift_bit_length(x): - return 1 << (x - 1).bit_length() + def ceil_div(a, b): + return (a + b - 1) // b + + def pad_to_multiple(a, b): + return ceil_div(a, b) * b + + def pad_to_next_power_of_2(a): + assert a > 0 + return 2**int(a - 1).bit_length() # calculate input shapes - max_num_queries_shifted = shift_bit_length(max_num_queries) - max_num_queries_factor = B_P_SIZE // max_num_queries_shifted - max_num_queries_padded = max_num_queries_shifted * max_num_queries_factor - assert (max_num_queries_padded == B_P_SIZE - ), "invalid {max_num_queries_padded=}" + max_num_queries = pad_to_multiple(sum(query_lens), block_size) + 
max_num_queries = pad_to_next_power_of_2(max_num_queries) head_size_padded = B_P_SIZE + assert head_size_padded >= head_size context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) - num_active_blocks_shifted = shift_bit_length( - ((context_lens + block_size - 1) // block_size).sum().item()) - num_active_blocks_factor = (LARGE_TILE_SZ // block_size // - num_active_blocks_shifted) - num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor - assert (num_active_blocks * - block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}" + num_active_blocks = ceil_div(context_lens, block_size).sum().item() + num_active_blocks = pad_to_multiple(num_active_blocks, + LARGE_TILE_SZ // block_size) context_kv_len = num_active_blocks * block_size - assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}" + assert (context_kv_len % + LARGE_TILE_SZ == 0), f"invalid context_kv_len={context_kv_len}" # pad QKV tensors pad_dims = ( @@ -360,7 +376,7 @@ def shift_bit_length(x): 0, 0, 0, - max_num_queries_padded - query.shape[0], + max_num_queries - query.shape[0], ) query = F.pad(query, pad_dims, "constant", 0) k = F.pad(k, pad_dims, "constant", 0) @@ -397,7 +413,7 @@ def shift_bit_length(x): 0, context_kv_len - prior_mask.shape[1], 0, - B_P_SIZE - prior_mask.shape[0], + max_num_queries - prior_mask.shape[0], ), "constant", 0, @@ -406,9 +422,9 @@ def shift_bit_length(x): active_mask, ( 0, - B_P_SIZE - active_mask.shape[1], + max_num_queries - active_mask.shape[1], 0, - B_P_SIZE - active_mask.shape[0], + max_num_queries - active_mask.shape[0], ), "constant", 0, @@ -430,6 +446,8 @@ def shift_bit_length(x): n_kv_head=num_kv_heads, head_size=head_size, mixed_precision=mixed_precision, + LARGE_TILE_SZ=LARGE_TILE_SZ, + return_debug_tensors=return_debug_tensors, ) if return_debug_tensors: @@ -439,17 +457,15 @@ def shift_bit_length(x): output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) debug_tensors = [] - output_nki = torch.tensor(output_nki).cpu() debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors] num_actual_tokens = sum(query_lens) - print(f"{num_actual_tokens=}") # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) - output_nki = output_nki.permute( - 0, 2, 1, 3)[:, :, :, :head_size].cpu()[0, :num_actual_tokens, :, :] + output_nki = output_nki.cpu().permute(0, 2, 1, 3)[:, :, :, :head_size] + output_nki = output_nki[0, :num_actual_tokens, :, :] output_ref_padded = F.pad( output_ref, - (0, 0, 0, 0, 0, 0, 0, max_num_queries_padded - output_ref.shape[0]), + (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]), "constant", 0, ) diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 68aa63f5ac1..5e2a1f7e66d 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -28,7 +28,6 @@ class FlashConfig: def transpose_p_local(p_local_transposed, p_local, LARGE_TILE_SZ, - forward_mask, B_F_SIZE=512): for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): if nisa.get_nc_version() == nisa.nc_version.gen3: @@ -46,13 +45,13 @@ def transpose_p_local(p_local_transposed, if nisa.get_nc_version() == nisa.nc_version.gen3: p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( - p_local[:, i_j_128_slice], mask=forward_mask) + p_local[:, i_j_128_slice]) else: p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( - p_local[:, i_j_128_slice], mask=forward_mask) + p_local[:, i_j_128_slice]) p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( - p_local_t_tmp, 
dtype=p_local_transposed.dtype, mask=forward_mask) + p_local_t_tmp, dtype=p_local_transposed.dtype) @nki.jit @@ -60,36 +59,25 @@ def _flash_attention_core( q_local_tile, k, v, - q_h_per_k_h, - seqlen_q, - nheads, o_buffer, l_buffer, m_buffer, - batch_id, - head_id, - gqa_head_idx, q_tile_idx, - local_k_large_tile_idx, kernel_dtype, acc_type, flash_config: FlashConfig, - use_causal_mask=False, - continuous_batching_mask=None, + use_causal_mask, + tile_mask, initialize=False, B_P_SIZE=128, B_F_SIZE=512, B_D_SIZE=128, - dropout_p=0.0, - dropout_p_tensor=None, - seed_tensor=None, - logit_bias_tile=None, qk_res_buffer=None, ): """ The flash attention core function to calculate self attention between a tile of q and a block of K and V. - The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF + The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF already. The block size of K and V is defined in the seq_tile_size of the flash_config. The results are stored in the following three buffers @@ -99,24 +87,9 @@ def _flash_attention_core( """ LARGE_TILE_SZ = flash_config.seq_tile_size num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE - seqlen_k = k.shape[-1] - seqlen_q // B_P_SIZE - seqlen_k // B_F_SIZE - - # TODO : support logit_bias with continuous_batching_mask - assert not use_causal_mask, "causal mask is not supported." - assert (continuous_batching_mask - is not None), "continuous_batching_mask input is required." - if continuous_batching_mask is not None: - assert ( - logit_bias_tile - is None), "continuous_batching_mask does not support logit_bias!" # mask are used to only apply computation to the lower half of the matrix, # which reduce the arithmetic intensity by half - forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * - LARGE_TILE_SZ if use_causal_mask else None) - qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), buffer=nl.sbuf, dtype=acc_type) @@ -125,20 +98,27 @@ def _flash_attention_core( for k_i in nl.affine_range(num_k_tile_per_large_tile): k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) - qk_psum = nl.zeros((par_dim(B_P_SIZE), B_F_SIZE), - dtype=np.float32, - buffer=nl.psum) # (128, 512) - qk_psum[:, :] = nl.matmul(q_local_tile, - k[:, k_i_b_f_slice], - transpose_x=True, - mask=None) # (p(128), 512) - - qk_res_buf[:, k_i_b_f_slice] = nl.where( - continuous_batching_mask[:, k_i_b_f_slice], - qk_psum[:, nl.ds(0, B_F_SIZE)], - -9984.0, - dtype=acc_type, - ) + if use_causal_mask: + multiplication_required_selection = (q_tile_idx * B_P_SIZE + >= k_i * B_F_SIZE) + else: + multiplication_required_selection = True + + if multiplication_required_selection: + qk_psum = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), + dtype=np.float32, + buffer=nl.psum) # (128, 512) + qk_psum[:, :] = nl.matmul(q_local_tile, + k[:, k_i_b_f_slice], + transpose_x=True) # (p(128), 512) + qk_res_buf[:, k_i_b_f_slice] = nl.where( + tile_mask[:, k_i_b_f_slice], + qk_psum[:, nl.ds(0, B_F_SIZE)], + -9984.0, + dtype=acc_type, + ) + else: + qk_res_buf[:, k_i_b_f_slice] = -9984.0 # Calculate max of the current tile max_local[:, k_i] = nisa.tensor_reduce( @@ -147,7 +127,6 @@ def _flash_attention_core( axis=(1, ), dtype=acc_type, negate=False, - mask=forward_mask, ) if qk_res_buffer is not None: @@ -159,7 +138,6 @@ def _flash_attention_core( axis=(1, ), dtype=acc_type, negate=False, - mask=forward_mask, ) o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), @@ -170,8 +148,7 @@ def _flash_attention_core( m_current = max_ else: m_previous = 
nl.copy(m_buffer[:, 0]) - m_buffer[:, 0] = nl.maximum(m_previous, max_, - mask=forward_mask) # (128,1) + m_buffer[:, 0] = nl.maximum(m_previous, max_) # (128,1) m_current = m_buffer[:, 0] # Compute scaling factor @@ -180,11 +157,8 @@ def _flash_attention_core( m_previous, bias=-1 * m_current, scale=1.0, - mask=forward_mask, ) - o_previous_scaled[...] = nl.multiply(o_buffer[:, :], - alpha, - mask=forward_mask) + o_previous_scaled[...] = nl.multiply(o_buffer[:, :], alpha) p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) @@ -207,10 +181,9 @@ def _flash_attention_core( reduce_op=nl.add, reduce_res=p_partial_sum[:, k_r_i], dtype=kernel_dtype, - mask=forward_mask, ) - ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type, mask=forward_mask) + ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type) p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), dtype=kernel_dtype) @@ -218,7 +191,6 @@ def _flash_attention_core( p_local_transposed=p_local_transposed, p_local=p_local, LARGE_TILE_SZ=LARGE_TILE_SZ, - forward_mask=forward_mask, B_F_SIZE=B_F_SIZE, ) @@ -230,27 +202,20 @@ def _flash_attention_core( p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], v[k_i, :, :], transpose_x=True, - mask=forward_mask, ) # (128, 128) (p(Br), d) if initialize: o_buffer[:, :] = nl.copy(pv_psum[:, :]) l_buffer[:, 0] = nl.add(nl.log(ps), max_) else: - o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum, mask=forward_mask) + o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum) l_prev = l_buffer[:, 0] l_exp = nl.add( - nl.exp( - nl.subtract(l_prev, m_current, mask=forward_mask), - mask=forward_mask, - ), + nl.exp(nl.subtract(l_prev, m_current)), ps, - mask=forward_mask, ) - l_buffer[:, 0] = nl.add(m_current, - nl.log(l_exp, mask=forward_mask), - mask=forward_mask) + l_buffer[:, 0] = nl.add(m_current, nl.log(l_exp)) @nki.jit @@ -279,6 +244,21 @@ def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config): ) +@nki.jit +def load_block_tables(block_tables_hbm, num_tiles): + (num_blocks, ) = block_tables_hbm.shape + assert num_blocks % num_tiles == 0 + num_blocks_per_tile = num_blocks // num_tiles + block_tables_hbm = block_tables_hbm.reshape( + (num_tiles, num_blocks_per_tile)) + block_tables_buffer = nl.load(block_tables_hbm, dtype=nl.int32) + return block_tables_buffer + + +def is_power_of_2(x): + return x > 0 and (x & (x - 1)) == 0 + + @nki.jit def flash_paged_attention( query, @@ -316,24 +296,24 @@ def flash_paged_attention( - We use paged cache blocks (key_cache, value_cache) to store KV cache. IO tensor dtypes: - - This kernel assumes all IO tensors have the same dtype except for + - This kernel assumes all IO tensors have the same dtype except for block_tables (int32) and mask (int32) - - If mixed_percision is True, then all Tensor Engine operation will be - performed in bfloat16 and accumulation will be performed in float32. + - If mixed_percision is True, then all Tensor Engine operation will be + performed in bfloat16 and accumulation will be performed in float32. Otherwise the intermediates will be in the same type as the inputs. 
Compile-time Constants: - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)` - mixed_precision: flag to set non-matmul ops in fp32 precision, default - is set to `true`, if false, we use same precision as input types + is set to `true`, if false, we use same precision as input types - config: Instance of dataclass :class:`nki.kernels.attention.FlashConfig` with Performance config parameters for flash attention with default values - seq_tile_size: `default=2048`, size of the kv tile size for attention + seq_tile_size: `default=2048`, size of the kv tile size for attention computation reduction GQA support Notes: - the spmd kernel for launching kernel should be on kv_heads instead of + the spmd kernel for launching kernel should be on kv_heads instead of nheads Example usage: @@ -415,18 +395,13 @@ def flash_paged_attention( ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}" num_large_k_tile = context_kv_len // LARGE_TILE_SZ num_blocks_per_large_tile = LARGE_TILE_SZ // block_size - assert (num_blocks_per_large_tile <= B_P_SIZE - ), f"The number of blocks in each large tile " \ - f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}" - - block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile), - 0, - dtype=np.int32, - buffer=nl.sbuf) - for j in nl.affine_range(num_large_k_tile): - i_p = nl.arange(num_blocks_per_large_tile)[:, None] - block_tables_sbuf[i_p, j] = nl.load( - block_tables[j * num_blocks_per_large_tile + i_p], dtype=np.int32) + assert block_size % 32 == 0, "block_size is expected to be a multiple of 32" + assert is_power_of_2( + num_blocks_per_large_tile + ), "The number of blocks in each large tile is expected of be power of 2" + assert is_power_of_2(seqlen_q), "seqlen_q is expected to be power of 2" + + block_tables_sbuf = load_block_tables(block_tables, num_large_k_tile) # Global Flash Attention accumulators o_buffer = nl.zeros( @@ -457,7 +432,7 @@ def flash_paged_attention( ) for k_i in nl.affine_range(num_blocks_per_large_tile): - loaded = nl.load(key_cache[block_tables_sbuf[k_i, j], :, + loaded = nl.load(key_cache[block_tables_sbuf[j, k_i], :, head_id, :]) cur_k_tile[:, nl.ds(k_i * block_size, block_size)] = nl.transpose(loaded) @@ -469,7 +444,7 @@ def flash_paged_attention( num_blocks_per_partition): v_i = (partition_idx * num_blocks_per_partition + block_in_partition) - loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :, + loaded_v = nl.load(value_cache[block_tables_sbuf[j, v_i], :, head_id, :]) cur_v_tile[ partition_idx, @@ -477,14 +452,15 @@ def flash_paged_attention( :, ] = loaded_v - cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), - dtype=mask.dtype) - for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): - cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load( - mask[:, nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE)]) - - for i_q_h in nl.affine_range(q_h_per_k_h): - for i in nl.affine_range(n_tile_q): + for i in nl.affine_range(n_tile_q): + cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=mask.dtype) + for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load(mask[ + nl.ds(i * B_P_SIZE, B_P_SIZE), + nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE), + ]) + for i_q_h in nl.affine_range(q_h_per_k_h): q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] q_sbuf_tile = nl.load( @@ -497,35 +473,24 @@ def flash_paged_attention( 
q_local_tile=q_tile, k=cur_k_tile, v=cur_v_tile, - q_h_per_k_h=q_h_per_k_h, - seqlen_q=seqlen_q, - nheads=h, o_buffer=o_buffer[i, i_q_h], l_buffer=l_buffer[:, i, i_q_h], m_buffer=m_buffer[i, i_q_h], - batch_id=batch_id, - head_id=head_id, - gqa_head_idx=i_q_h, q_tile_idx=i, - local_k_large_tile_idx=j, kernel_dtype=kernel_dtype, acc_type=acc_type, flash_config=config, use_causal_mask=False, - continuous_batching_mask=cur_mask, + tile_mask=cur_mask, initialize=j == 0, B_P_SIZE=B_P_SIZE, B_F_SIZE=B_F_SIZE, B_D_SIZE=B_D_SIZE, - dropout_p=0.0, - dropout_p_tensor=None, - seed_tensor=None, - logit_bias_tile=None, ) # compute attention between input query, key and value if key is not None and value is not None: - B_F_SIZE = seqlen_q + B_F_SIZE = min(seqlen_q, B_F_SIZE) LARGE_TILE_SZ = seqlen_q active_config = FlashConfig( seq_tile_size=LARGE_TILE_SZ, @@ -552,11 +517,16 @@ def flash_paged_attention( config=active_config, ) - cur_mask = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), dtype=mask.dtype) - cur_mask[:, :] = nl.load(mask[:, nl.ds(context_kv_len, B_F_SIZE)]) + for i in nl.affine_range(n_tile_q): + cur_mask = nl.load( + mask[ + nl.ds(i * B_P_SIZE, B_P_SIZE), + nl.ds(context_kv_len, LARGE_TILE_SZ), + ], + dtype=mask.dtype, + ) + for i_q_h in nl.affine_range(q_h_per_k_h): - for i_q_h in nl.affine_range(q_h_per_k_h): - for i in nl.affine_range(n_tile_q): q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] q_sbuf_tile = nl.load( @@ -568,32 +538,21 @@ def flash_paged_attention( q_local_tile=q_tile, k=cur_k_tile, v=cur_v_tile, - q_h_per_k_h=q_h_per_k_h, - seqlen_q=seqlen_q, - nheads=h, o_buffer=o_buffer[i, i_q_h], l_buffer=l_buffer[:, i, i_q_h], m_buffer=m_buffer[i, i_q_h], - batch_id=batch_id, - head_id=head_id, - gqa_head_idx=i_q_h, q_tile_idx=i, - local_k_large_tile_idx=0, kernel_dtype=kernel_dtype, acc_type=acc_type, flash_config=active_config, - use_causal_mask=False, - continuous_batching_mask=cur_mask, + use_causal_mask=True, + tile_mask=cur_mask, initialize=False, B_P_SIZE=B_P_SIZE, B_F_SIZE=B_F_SIZE, B_D_SIZE=B_D_SIZE, - dropout_p=0.0, - dropout_p_tensor=None, - seed_tensor=None, - logit_bias_tile=None, - qk_res_buffer=qk_res_buffer[i, i_q_h] - if qk_res_buffer is not None else None, + qk_res_buffer=(qk_res_buffer[i, i_q_h] + if qk_res_buffer is not None else None), ) # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # @@ -652,7 +611,6 @@ def flash_attn_varlen_nkifunc( attn_mask, n_kv_head=None, head_size=None, - B_P_SIZE=128, LARGE_TILE_SZ=2048, return_debug_tensors=False, mixed_precision=True, From bbd07a54341a8ecd94d655775f86e258d075cbf5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Feb 2025 00:40:19 -0800 Subject: [PATCH 0119/1240] Bump helm/kind-action from 1.10.0 to 1.12.0 (#11612) Signed-off-by: Louis Ulmer --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 556b60d2fca..9d2e54ce926 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -47,7 +47,7 @@ jobs: aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive - name: Create kind cluster - uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0 + uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 - name: Build the Docker 
image vllm cpu run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env . From 3af7305db7a245554f75fed29ac2e2a3714ab6c5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Feb 2025 00:40:25 -0800 Subject: [PATCH 0120/1240] Bump actions/stale from 9.0.0 to 9.1.0 (#12462) Signed-off-by: Louis Ulmer --- .github/workflows/stale.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 81e7c9b0507..656f3d3fa7b 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -13,7 +13,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months From ecea1e5b11df42e108283728c1e522a2ca10a627 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Feb 2025 16:41:06 +0800 Subject: [PATCH 0121/1240] Bump helm/chart-testing-action from 2.6.1 to 2.7.0 (#12463) Signed-off-by: Louis Ulmer --- .github/workflows/lint-and-deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 9d2e54ce926..99365c67c29 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -22,7 +22,7 @@ jobs: python-version: '3.13' - name: Set up chart-testing - uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1 + uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0 with: version: v3.10.1 From 3c4784fc324189472d83c3878275b0346ca2af0c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 12 Feb 2025 16:41:22 +0800 Subject: [PATCH 0122/1240] Bump actions/setup-python from 5.3.0 to 5.4.0 (#12672) Signed-off-by: Louis Ulmer --- .github/workflows/cleanup_pr_body.yml | 2 +- .github/workflows/lint-and-deploy.yaml | 2 +- .github/workflows/pre-commit.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 0085a1cc223..50fea0c43cb 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: '3.12' diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml index 99365c67c29..a4e9acc414d 100644 --- a/.github/workflows/lint-and-deploy.yaml +++ b/.github/workflows/lint-and-deploy.yaml @@ -17,7 +17,7 @@ jobs: version: v3.14.4 #Python is required because ct lint runs Yamale and yamllint which require Python. 
- - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: '3.13' diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 06564969dc7..dc10b9116bb 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" From b0adb6a1f9b3d5981faf910543bbaff8e04091ab Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 12 Feb 2025 07:34:09 -0300 Subject: [PATCH 0123/1240] Further reduce the HTTP calls to huggingface.co (#13107) Signed-off-by: Louis Ulmer --- vllm/transformers_utils/config.py | 135 +++++++++++++++++------------- 1 file changed, 79 insertions(+), 56 deletions(-) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index aade28610b3..4b76509e454 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -4,12 +4,14 @@ import json import os import time +from functools import cache from pathlib import Path -from typing import Any, Dict, Literal, Optional, Type, Union +from typing import Any, Callable, Dict, Literal, Optional, Type, Union import huggingface_hub -from huggingface_hub import (file_exists, hf_hub_download, list_repo_files, - try_to_load_from_cache) +from huggingface_hub import hf_hub_download +from huggingface_hub import list_repo_files as hf_list_repo_files +from huggingface_hub import try_to_load_from_cache from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, HFValidationError, LocalEntryNotFoundError, RepositoryNotFoundError, @@ -86,6 +88,65 @@ class ConfigFormat(str, enum.Enum): MISTRAL = "mistral" +def with_retry(func: Callable[[], Any], + log_msg: str, + max_retries: int = 2, + retry_delay: int = 2): + for attempt in range(max_retries): + try: + return func() + except Exception as e: + if attempt == max_retries - 1: + logger.error("%s: %s", log_msg, e) + raise + logger.error("%s: %s, retrying %d of %d", log_msg, e, attempt + 1, + max_retries) + time.sleep(retry_delay) + retry_delay *= 2 + + +# @cache doesn't cache exceptions +@cache +def list_repo_files( + repo_id: str, + *, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + token: Union[str, bool, None] = None, +) -> list[str]: + + def lookup_files(): + try: + return hf_list_repo_files(repo_id, + revision=revision, + repo_type=repo_type, + token=token) + except huggingface_hub.errors.OfflineModeIsEnabled: + # Don't raise in offline mode, + # all we know is that we don't have this + # file cached. 
+ return [] + + return with_retry(lookup_files, "Error retrieving file list") + + +def file_exists( + repo_id: str, + file_name: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Union[str, bool, None] = None, +) -> bool: + + file_list = list_repo_files(repo_id, + repo_type=repo_type, + revision=revision, + token=token) + return file_name in file_list + + +# In offline mode the result can be a false negative def file_or_path_exists(model: Union[str, Path], config_name: str, revision: Optional[str]) -> bool: if Path(model).exists(): @@ -103,31 +164,10 @@ def file_or_path_exists(model: Union[str, Path], config_name: str, # hf_hub. This will fail in offline mode. # Call HF to check if the file exists - # 2 retries and exponential backoff - max_retries = 2 - retry_delay = 2 - for attempt in range(max_retries): - try: - return file_exists(model, - config_name, - revision=revision, - token=HF_TOKEN) - except huggingface_hub.errors.OfflineModeIsEnabled: - # Don't raise in offline mode, - # all we know is that we don't have this - # file cached. - return False - except Exception as e: - logger.error( - "Error checking file existence: %s, retrying %d of %d", e, - attempt + 1, max_retries) - if attempt == max_retries - 1: - logger.error("Error checking file existence: %s", e) - raise - time.sleep(retry_delay) - retry_delay *= 2 - continue - return False + return file_exists(str(model), + config_name, + revision=revision, + token=HF_TOKEN) def patch_rope_scaling(config: PretrainedConfig) -> None: @@ -208,32 +248,7 @@ def get_config( revision=revision): config_format = ConfigFormat.MISTRAL else: - # If we're in offline mode and found no valid config format, then - # raise an offline mode error to indicate to the user that they - # don't have files cached and may need to go online. - # This is conveniently triggered by calling file_exists(). 
- - # Call HF to check if the file exists - # 2 retries and exponential backoff - max_retries = 2 - retry_delay = 2 - for attempt in range(max_retries): - try: - file_exists(model, - HF_CONFIG_NAME, - revision=revision, - token=HF_TOKEN) - except Exception as e: - logger.error( - "Error checking file existence: %s, retrying %d of %d", - e, attempt + 1, max_retries) - if attempt == max_retries: - logger.error("Error checking file existence: %s", e) - raise e - time.sleep(retry_delay) - retry_delay *= 2 - - raise ValueError(f"No supported config format found in {model}") + raise ValueError(f"No supported config format found in {model}.") if config_format == ConfigFormat.HF: config_dict, _ = PretrainedConfig.get_config_dict( @@ -339,10 +354,11 @@ def get_hf_file_to_dict(file_name: str, file_name=file_name, revision=revision) - if file_path is None and file_or_path_exists( - model=model, config_name=file_name, revision=revision): + if file_path is None: try: hf_hub_file = hf_hub_download(model, file_name, revision=revision) + except huggingface_hub.errors.OfflineModeIsEnabled: + return None except (RepositoryNotFoundError, RevisionNotFoundError, EntryNotFoundError, LocalEntryNotFoundError) as e: logger.debug("File or repository not found in hf_hub_download", e) @@ -363,6 +379,7 @@ def get_hf_file_to_dict(file_name: str, return None +@cache def get_pooling_config(model: str, revision: Optional[str] = 'main'): """ This function gets the pooling and normalize @@ -390,6 +407,8 @@ def get_pooling_config(model: str, revision: Optional[str] = 'main'): if modules_dict is None: return None + logger.info("Found sentence-transformers modules configuration.") + pooling = next((item for item in modules_dict if item["type"] == "sentence_transformers.models.Pooling"), None) @@ -408,6 +427,7 @@ def get_pooling_config(model: str, revision: Optional[str] = 'main'): if pooling_type_name is not None: pooling_type_name = get_pooling_config_name(pooling_type_name) + logger.info("Found pooling configuration.") return {"pooling_type": pooling_type_name, "normalize": normalize} return None @@ -435,6 +455,7 @@ def get_pooling_config_name(pooling_name: str) -> Union[str, None]: return None +@cache def get_sentence_transformer_tokenizer_config(model: str, revision: Optional[str] = 'main' ): @@ -491,6 +512,8 @@ def get_sentence_transformer_tokenizer_config(model: str, if not encoder_dict: return None + logger.info("Found sentence-transformers tokenize configuration.") + if all(k in encoder_dict for k in ("max_seq_length", "do_lower_case")): return encoder_dict return None From 6de91448a3f160a3230b7a33f5c20bbea8650e47 Mon Sep 17 00:00:00 2001 From: Shiyan Deng <842974287@qq.com> Date: Wed, 12 Feb 2025 02:36:10 -0800 Subject: [PATCH 0124/1240] [Misc] AMD Build Improvements (#12923) Signed-off-by: Louis Ulmer --- csrc/moe/moe_align_sum_kernels.cu | 2 +- csrc/rocm/attention.cu | 2 +- vllm/model_executor/models/registry.py | 15 +++++++++++---- vllm/transformers_utils/configs/__init__.py | 2 +- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 01dac404465..c072744f066 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -3,7 +3,7 @@ #include #include -#include +#include #include "../cuda_compat.h" #include "../dispatch_utils.h" diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index ffa9d44610a..366b3cdc23a 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -1122,4 +1122,4 @@ 
void paged_attention( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ebf6a88f21b..198b6d13471 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -205,6 +205,14 @@ **_FALLBACK_MODEL, } +# This variable is used as the args for subprocess.run(). We +# can modify this variable to alter the args if needed. e.g. +# when we use par format to pack things together, sys.executable +# might not be the target we want to run. +_SUBPROCESS_COMMAND = [ + sys.executable, "-m", "vllm.model_executor.models.registry" +] + @dataclass(frozen=True) class _ModelInfo: @@ -502,10 +510,9 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T: # cannot use `sys.executable __file__` here because the script # contains relative imports - returned = subprocess.run( - [sys.executable, "-m", "vllm.model_executor.models.registry"], - input=input_bytes, - capture_output=True) + returned = subprocess.run(_SUBPROCESS_COMMAND, + input=input_bytes, + capture_output=True) # check if the subprocess is successful try: diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index c484a755ab4..9060565596b 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -45,4 +45,4 @@ "SolarConfig", "Telechat2Config", "UltravoxConfig", -] \ No newline at end of file +] From e69a3d673a3fc61bd13b263c921ccb31df7bb0b9 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 12 Feb 2025 05:39:16 -0500 Subject: [PATCH 0125/1240] [Bug] [V1] Try fetching stop_reason from EngineOutput before checking the request (#13108) Signed-off-by: Louis Ulmer --- vllm/v1/engine/output_processor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 7973c62c381..1438f9d5a7b 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -2,7 +2,7 @@ import asyncio from dataclasses import dataclass -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union from vllm.outputs import RequestOutput from vllm.sampling_params import RequestOutputKind @@ -164,6 +164,7 @@ def process_outputs( new_token_ids = engine_core_output.new_token_ids finish_reason = engine_core_output.finish_reason + stop_reason = engine_core_output.stop_reason # TODO(andy): prompt logprobs + chunked prefill can # result in engine core returning an output for a @@ -181,9 +182,10 @@ def process_outputs( # 2) Detokenize the token ids into text and check for stop # strings. - stop_reason = req_state.detokenizer.update(new_token_ids) - if stop_reason: + stop_string = req_state.detokenizer.update(new_token_ids) + if stop_string and finish_reason != FinishReason.STOP: finish_reason = FinishReason.STOP + stop_reason = stop_string # 3) Compute sample and prompt logprobs for request, # if required. 
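The hunk above changes the V1 output processor so that the finish_reason and stop_reason reported by the engine core are used first, and a stop string found during detokenization only overrides them when the engine has not already stopped the request. A minimal, self-contained sketch of that precedence logic follows; it is illustrative only, and the helper name resolve_finish is hypothetical rather than part of vLLM's API.

from enum import Enum
from typing import Optional, Tuple, Union


class FinishReason(Enum):
    STOP = "stop"
    LENGTH = "length"


def resolve_finish(
    engine_finish: Optional[FinishReason],
    engine_stop: Union[int, str, None],
    stop_string: Optional[str],
) -> Tuple[Optional[FinishReason], Union[int, str, None]]:
    # Prefer what the engine core already reported; a detokenizer stop
    # string only upgrades the result to STOP when the engine did not
    # already stop the request itself.
    finish_reason, stop_reason = engine_finish, engine_stop
    if stop_string and finish_reason != FinishReason.STOP:
        finish_reason = FinishReason.STOP
        stop_reason = stop_string
    return finish_reason, stop_reason


# Engine already stopped on a stop-token id: its stop_reason is kept.
assert resolve_finish(FinishReason.STOP, 7, None) == (FinishReason.STOP, 7)
# Detokenizer found a stop string: it becomes the stop_reason.
assert resolve_finish(None, None, "###") == (FinishReason.STOP, "###")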
@@ -250,7 +252,7 @@ def _make_request_output( request_state: RequestState, new_token_ids: List[int], finish_reason: Optional[FinishReason], - stop_reason: Optional[str], + stop_reason: Union[int, str, None], ) -> Optional[RequestOutput]: finished = finish_reason is not None From a23fd789988ce3517ad000554455dca3d0a43dc0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 12 Feb 2025 19:55:23 +0800 Subject: [PATCH 0126/1240] [Bugfix] Fix num video tokens calculation for Qwen2-VL (#13148) Signed-off-by: DarkLight1337 Signed-off-by: Louis Ulmer --- vllm/model_executor/models/qwen2_vl.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f2071eaff48..d3294a4d4a3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -800,7 +800,11 @@ def _get_vision_info( preprocessed_size = ImageSize(width=image_width, height=image_height) - grid_t = max(num_frames // temporal_patch_size, 1) + # NOTE: Frames are padded to be divisible by `temporal_patch_size` + # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294 + padded_num_frames = num_frames + num_frames % temporal_patch_size + + grid_t = max(padded_num_frames // temporal_patch_size, 1) grid_h = preprocessed_size.height // patch_size grid_w = preprocessed_size.width // patch_size From 30640379060e9af3a2591de3e0f4f3ab1561c01b Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 12 Feb 2025 11:29:56 -0500 Subject: [PATCH 0127/1240] [Frontend] Generate valid tool call IDs when using `tokenizer-mode=mistral` (#12332) Signed-off-by: Louis Ulmer --- tests/mistral_tool_use/__init__.py | 0 tests/mistral_tool_use/conftest.py | 40 +++++++++++++++++++ .../test_mistral_tool_calls.py | 29 ++++++++++++++ tests/mistral_tool_use/utils.py | 33 +++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 16 +++++--- .../tool_parsers/mistral_tool_parser.py | 2 +- .../transformers_utils/tokenizers/__init__.py | 7 +++- vllm/transformers_utils/tokenizers/mistral.py | 30 ++++++++++++++ 8 files changed, 149 insertions(+), 8 deletions(-) create mode 100644 tests/mistral_tool_use/__init__.py create mode 100644 tests/mistral_tool_use/conftest.py create mode 100644 tests/mistral_tool_use/test_mistral_tool_calls.py create mode 100644 tests/mistral_tool_use/utils.py diff --git a/tests/mistral_tool_use/__init__.py b/tests/mistral_tool_use/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/mistral_tool_use/conftest.py b/tests/mistral_tool_use/conftest.py new file mode 100644 index 00000000000..39ab01c9b87 --- /dev/null +++ b/tests/mistral_tool_use/conftest.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import pytest_asyncio +from huggingface_hub import snapshot_download + +from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform + +from .utils import ARGS, CONFIGS, ServerConfig + + +# for each server config, download the model and return the config +@pytest.fixture(scope="session", params=CONFIGS.keys()) +def server_config(request): + config = CONFIGS[request.param] + + if current_platform.is_rocm() and not config.get("supports_rocm", True): + pytest.skip("The {} model can't be tested on the ROCm platform".format( + config["model"])) + + # download model and tokenizer using transformers + snapshot_download(config["model"]) + yield CONFIGS[request.param] + + +# run this for 
each server config +@pytest.fixture(scope="session") +def server(request, server_config: ServerConfig): + model = server_config["model"] + args_for_model = server_config["arguments"] + with RemoteOpenAIServer(model, ARGS + args_for_model, + max_wait_seconds=480) as server: + yield server + + +@pytest_asyncio.fixture +async def client(server: RemoteOpenAIServer): + async with server.get_async_client() as async_client: + yield async_client diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/mistral_tool_use/test_mistral_tool_calls.py new file mode 100644 index 00000000000..bbb3a07895f --- /dev/null +++ b/tests/mistral_tool_use/test_mistral_tool_calls.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 + +import openai +import pytest + +from tests.tool_use.utils import MESSAGES_ASKING_FOR_TOOLS, WEATHER_TOOL + + +# test: a tool_choice with mistral-tokenizer results in an ID of length 9 +@pytest.mark.asyncio +async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_ASKING_FOR_TOOLS, + temperature=0, + max_completion_tokens=100, + model=model_name, + tools=[WEATHER_TOOL], + tool_choice=WEATHER_TOOL, + logprobs=False) + + choice = chat_completion.choices[0] + + assert choice.finish_reason != "tool_calls" # "stop" or "length" + assert choice.message.role == "assistant" + assert choice.message.tool_calls is None \ + or len(choice.message.tool_calls) == 1 + assert len(choice.message.tool_calls[0].id) == 9 # length of 9 for mistral diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py new file mode 100644 index 00000000000..971ed55ca3c --- /dev/null +++ b/tests/mistral_tool_use/utils.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Optional + +from typing_extensions import TypedDict + + +class ServerConfig(TypedDict, total=False): + model: str + arguments: List[str] + system_prompt: Optional[str] + supports_parallel: Optional[bool] + supports_rocm: Optional[bool] + + +ARGS: List[str] = ["--max-model-len", "1024"] + +CONFIGS: Dict[str, ServerConfig] = { + "mistral": { + "model": + "mistralai/Mistral-7B-Instruct-v0.3", + "arguments": [ + "--tokenizer-mode", "mistral", + "--ignore-patterns=\"consolidated.safetensors\"" + ], + "system_prompt": + "You are a helpful assistant with access to tools. If a tool" + " that you have would be helpful to answer a user query, " + "call the tool. Otherwise, answer the user's query directly " + "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " + "to the user's question - just respond to it normally." 
+ }, +} diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 107220d548a..934bd2a9506 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -28,12 +28,15 @@ from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( + MistralToolCall) from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.transformers_utils.tokenizers import maybe_serialize_tool_calls +from vllm.transformers_utils.tokenizers import (maybe_serialize_tool_calls, + truncate_tool_call_ids) logger = init_logger(__name__) @@ -150,11 +153,12 @@ async def create_chat_completion( return self.create_error_response( "tool_choice = \"required\" is not supported!") - # because of issues with pydantic we need to potentially - # re-serialize the tool_calls field of the request - # for more info: see comment in `maybe_serialize_tool_calls` if isinstance(tokenizer, MistralTokenizer): + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` maybe_serialize_tool_calls(request) + truncate_tool_call_ids(request) if (request.tool_choice == "auto" and not (self.enable_auto_tools and tool_parser is not None) @@ -745,11 +749,13 @@ async def chat_completion_full_generator( elif request.tool_choice and type( request.tool_choice) is ChatCompletionNamedToolChoiceParam: + tool_call_class = MistralToolCall if isinstance( + tokenizer, MistralTokenizer) else ToolCall message = ChatMessage( role=role, content="", tool_calls=[ - ToolCall(function=FunctionCall( + tool_call_class(function=FunctionCall( name=request.tool_choice.function.name, arguments=output.text)) ]) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index 51354f7c956..4f048088299 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -33,7 +33,7 @@ class MistralToolCall(ToolCall): @staticmethod def generate_random_id(): - # Mistral Tool Call Ids must be alphanumeric with a maximum length of 9. + # Mistral Tool Call Ids must be alphanumeric with a length of 9. 
# https://github.com/mistralai/mistral-common/blob/21ee9f6cee3441e9bb1e6ed2d10173f90bd9b94b/src/mistral_common/protocol/instruct/validator.py#L299 return "".join(choices(ALPHANUMERIC, k=9)) diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index 2b64f3fc705..c12388d9b20 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -from .mistral import MistralTokenizer, maybe_serialize_tool_calls +from .mistral import (MistralTokenizer, maybe_serialize_tool_calls, + truncate_tool_call_ids) -__all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"] +__all__ = [ + "MistralTokenizer", "maybe_serialize_tool_calls", "truncate_tool_call_ids" +] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 59131a9d7bf..4e76f2dc871 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -68,6 +68,36 @@ def maybe_serialize_tool_calls(request: "ChatCompletionRequest"): request.messages[i]["tool_calls"] = validated_tool_calls +def truncate_tool_call_ids(request: "ChatCompletionRequest"): + """Truncates tool call IDs for Mistral's ID requirements.""" + for i, message in enumerate(request.messages): + if message.get("role") == 'assistant': + tool_calls = message.get("tool_calls", []) + for tool_call in tool_calls: + if len(tool_call["id"]) > 9: + logger.warning( + "Truncating tool call ID: %s to %s", + tool_call["id"], + tool_call["id"][-9:], + ) + tool_call["id"] = tool_call["id"][-9:] + + request.messages[i]["tool_calls"] = tool_calls + + elif message.get("role") in {"tool_results", "tool"}: + if "tool_call_id" in message: + tool_call_id = message["tool_call_id"] + + if len(tool_call_id) > 9: + logger.warning( + "Truncating tool_call_id: %s to %s", + tool_call_id, + tool_call_id[-9:], + ) + tool_call_id = tool_call_id[-9:] + request.messages[i]["tool_call_id"] = tool_call_id + + def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: repo_cache = os.path.join( huggingface_hub.constants.HF_HUB_CACHE, From 103de59c7e76c9ea627fd34d53e67d614748668f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 13 Feb 2025 00:58:24 +0800 Subject: [PATCH 0128/1240] [Misc] Delete unused LoRA modules (#13151) Signed-off-by: Louis Ulmer --- tests/lora/test_lora_manager.py | 18 ++++++++++++------ vllm/lora/models.py | 8 +++++++- vllm/lora/punica_wrapper/punica_base.py | 2 +- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 6666f54fdeb..9fecd11f57a 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -606,20 +606,26 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device): assert isinstance(model.get_submodule("gate_up_proj"), MergedColumnParallelLinearWithLoRA) + # Verify packed lora is correct + model_lora_clone = model_lora.clone(1) + model_lora_clone1 = model_lora1.clone(1) assert manager.add_adapter(model_lora) assert manager.add_adapter(model_lora1) + assert model_lora.get_lora("gate_proj") is None + assert model_lora.get_lora("up_proj") is None + assert model_lora1.get_lora("up_proj") is None packed_lora = model_lora.get_lora("gate_up_proj") assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) torch.testing.assert_close(packed_lora.lora_a[0], - 
model_lora.get_lora("gate_proj").lora_a) + model_lora_clone.get_lora("gate_proj").lora_a) torch.testing.assert_close(packed_lora.lora_b[0], - model_lora.get_lora("gate_proj").lora_b) + model_lora_clone.get_lora("gate_proj").lora_b) torch.testing.assert_close(packed_lora.lora_a[1], - model_lora.get_lora("up_proj").lora_a) + model_lora_clone.get_lora("up_proj").lora_a) torch.testing.assert_close(packed_lora.lora_b[1], - model_lora.get_lora("up_proj").lora_b) + model_lora_clone.get_lora("up_proj").lora_b) packed_lora1 = model_lora1.get_lora("gate_up_proj") assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) @@ -627,6 +633,6 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device): assert packed_lora1.lora_a[0] is None assert packed_lora1.lora_b[0] is None torch.testing.assert_close(packed_lora1.lora_a[1], - model_lora1.get_lora("up_proj").lora_a) + model_lora_clone1.get_lora("up_proj").lora_a) torch.testing.assert_close(packed_lora1.lora_b[1], - model_lora1.get_lora("up_proj").lora_b) + model_lora_clone1.get_lora("up_proj").lora_b) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index ef77fd4b74c..b7403980d0b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -5,7 +5,8 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union +from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Type, + Union) import safetensors.torch import torch @@ -619,12 +620,14 @@ def _register_packed_modules(self, module_full_name: str) -> None: def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: for module_name, new_module_names in self.packed_modules.items(): replacement_loras: List[Optional[LoRALayerWeights]] = [] + replaced_module: Set[str] = set() has_replacement = False for r in new_module_names: lora = lora_model.get_lora(r) replacement_loras.append(lora) if lora: has_replacement = True + replaced_module.add(r) if not has_replacement: continue for i in range(len(replacement_loras)): @@ -633,6 +636,9 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: replacement_loras[i] = None lora_model.loras[module_name] = PackedLoRALayerWeights.pack( replacement_loras) + # Remove the modules that have been replaced. + for module in replaced_module: + lora_model.loras.pop(module, None) def deactivate_adapter(self, adapter_id: int) -> bool: return deactivate_adapter(adapter_id, self._active_adapters, diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 1a2282ae9ac..dad98f8e212 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -147,7 +147,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, dtype=torch.long, device=device) - # 5 is the number of indicies tensors. + # 5 is the number of indices tensors. 
# base_indices, sampler_indices, sampler_indices_padded, # embeddings_indices,long_lora_indices self.indices_len: List[Optional[int]] = [None] * 5 From 9e54dbad571331f61b9cfe9a1e47dbab4b1bbe7c Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Wed, 12 Feb 2025 09:06:13 -0800 Subject: [PATCH 0129/1240] Introduce VLLM_CUDART_SO_PATH to allow users specify the .so path (#12998) Signed-off-by: Lu Fang Signed-off-by: Louis Ulmer --- .../device_communicators/cuda_wrapper.py | 32 ++++++++++++++++++- vllm/envs.py | 6 ++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index 010caf7ebac..bc2cfbf3218 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -5,12 +5,14 @@ """ import ctypes +import glob from dataclasses import dataclass from typing import Any, Dict, List, Optional # this line makes it possible to directly load `libcudart.so` using `ctypes` import torch # noqa +import vllm.envs as envs from vllm.logger import init_logger logger = init_logger(__name__) @@ -60,6 +62,29 @@ def find_loaded_library(lib_name) -> Optional[str]: return path +def get_cudart_lib_path_from_env() -> Optional[str]: + """ + In some system, find_loaded_library() may not work. So we allow users to + specify the path through environment variable VLLM_CUDART_SO_PATH. + """ + cudart_so_env = envs.VLLM_CUDART_SO_PATH + if cudart_so_env is not None: + cudart_paths = [ + cudart_so_env, + ] + for path in cudart_paths: + file_paths = glob.glob(path) + if len(file_paths) > 0: + logger.info( + "Found cudart library at %s through env var" + "VLLM_CUDART_SO_PATH=%s", + file_paths[0], + cudart_so_env, + ) + return file_paths[0] + return None + + class CudaRTLibrary: exported_functions = [ # ​cudaError_t cudaSetDevice ( int device ) @@ -105,8 +130,13 @@ class CudaRTLibrary: def __init__(self, so_file: Optional[str] = None): if so_file is None: so_file = find_loaded_library("libcudart") + if so_file is None: + so_file = get_cudart_lib_path_from_env() assert so_file is not None, \ - "libcudart is not loaded in the current process" + ( + "libcudart is not loaded in the current process, " + "try setting VLLM_CUDART_SO_PATH" + ) if so_file not in CudaRTLibrary.path_to_library_cache: lib = ctypes.CDLL(so_file) CudaRTLibrary.path_to_library_cache[so_file] = lib diff --git a/vllm/envs.py b/vllm/envs.py index 745b068b7a4..d99c794e69e 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -87,6 +87,7 @@ VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_BUNDLE_INDICES: str = "" + VLLM_CUDART_SO_PATH: Optional[str] = None def get_default_cache_root(): @@ -572,6 +573,11 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # models the alignment is already naturally aligned to 256 bytes. "VLLM_CUDA_MEM_ALIGN_KV_CACHE": lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))), + + # In some system, find_loaded_library() may not work. So we allow users to + # specify the path through environment variable VLLM_CUDART_SO_PATH. 
+ "VLLM_CUDART_SO_PATH": + lambda: os.getenv("VLLM_CUDART_SO_PATH", None), } # end-env-vars-definition From 479dca47aa1fa707d32165c95e720cdd3e0a6575 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 12 Feb 2025 12:12:22 -0500 Subject: [PATCH 0130/1240] [CI/Build] Use mypy matcher for pre-commit CI job (#13162) Signed-off-by: Russell Bryant Signed-off-by: Louis Ulmer --- .github/workflows/pre-commit.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index dc10b9116bb..6ab63a40277 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -14,6 +14,7 @@ jobs: with: python-version: "3.12" - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - run: echo "::add-matcher::.github/workflows/matchers/mypy.json" - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 with: extra_args: --all-files --hook-stage manual From 7e85d576088b2be66f7be35f149495218a86ee1c Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Thu, 13 Feb 2025 01:19:43 +0800 Subject: [PATCH 0131/1240] [CORE] [QUANT] Support for GPTQModel's `dynamic` quantization per module override/control (#7086) Signed-off-by: Louis Ulmer --- tests/quantization/test_gptq_dynamic.py | 68 ++++++++++++++ tests/quantization/test_lm_head.py | 25 +++-- vllm/lora/layers.py | 2 +- .../model_executor/layers/logits_processor.py | 6 +- .../layers/quantization/gptq.py | 47 ++++++++-- .../layers/quantization/gptq_marlin.py | 59 +++++++++--- .../layers/quantization/utils/gptq_utils.py | 94 +++++++++++++++++++ .../layers/vocab_parallel_embedding.py | 36 +++---- 8 files changed, 281 insertions(+), 56 deletions(-) create mode 100644 tests/quantization/test_gptq_dynamic.py create mode 100644 vllm/model_executor/layers/quantization/utils/gptq_utils.py diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py new file mode 100644 index 00000000000..c6f34fef274 --- /dev/null +++ b/tests/quantization/test_gptq_dynamic.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests whether gptq models with dynamic quantized can be loaded. + +Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. +""" + +import pytest +import torch + +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinLinearMethod) +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_dynamic_override) + +PROMPT = "On the surface of Mars, we found" + +# The first layer is quantized using bits=4, group_size=128 +# The second layer is quantized using bits=8, group_size=32 +# All other layers (layer index >= 2) are not quantized +MODEL_QUANT = [ + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue", + True), + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse", + False), +] + + +@pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) +def test_gptq_with_dynamic(vllm_runner, model_id: str, + use_marlin_kernel: bool): + + vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + + linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( + GPTQLinearMethod) + + for name, submodule in (vllm_model.model.llm_engine.model_executor. 
+ driver_worker.model_runner.model.named_modules()): + if name == "lm_head": + assert isinstance(submodule.quant_method, linear_method_cls) + elif name == 'model.layers.0.self_attn.qkv_proj': + # The first layer is quantized using bits=4, group_size=128 + # desc_act=True + assert isinstance(submodule.quant_method, linear_method_cls) + config = submodule.quant_method.quant_config + assert config.weight_bits == 4 + assert config.group_size == 128 + assert config.desc_act + elif name == 'model.layers.1.self_attn.qkv_proj': + # The second layer is quantized using bits=8, group_size=32 + # desc_act=False + assert isinstance(submodule.quant_method, linear_method_cls) + config = submodule.quant_method.quant_config + assert get_dynamic_override(config, layer_name=name, + key="bits") == 8 + assert get_dynamic_override(config, + layer_name=name, + key="group_size") == 32 + assert not get_dynamic_override( + config, layer_name=name, key="desc_act") + elif (name == 'model.layers.2.self_attn.qkv_proj' + or name == 'model.layers.2.mlp.gate_up_proj'): + # All other layers (layer index >= 2) are not quantized + assert isinstance(submodule.quant_method, UnquantizedLinearMethod) + + del vllm_model diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ec60d8a5755..20435a287e3 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -3,7 +3,6 @@ Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. """ -from typing import Tuple import pytest import torch @@ -17,31 +16,31 @@ PROMPT = "On the surface of Mars, we found" -MODELS_QUANT = [( - "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse", - True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), - ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)] +MODELS_QUANT = [ + ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True), + ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False), + ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), + ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False) +] -@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) +@pytest.mark.parametrize("model_id, lm_head_quantized", MODELS_QUANT) def test_lm_head( vllm_runner, - model_lm_head_quant: Tuple[str, bool], + model_id: str, + lm_head_quantized: bool, ) -> None: - model, lm_head_quantized = model_lm_head_quant - - with vllm_runner(model, dtype=torch.float16, + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: def check_model(model): lm_head_layer = model.lm_head - if lm_head_quantized: - assert isinstance(lm_head_layer.linear_method, + assert isinstance(lm_head_layer.quant_method, (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) else: - assert isinstance(lm_head_layer.linear_method, + assert isinstance(lm_head_layer.quant_method, UnquantizedEmbeddingMethod) vllm_model.apply_model(check_model) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9826aeb9dc2..7f68dae9717 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1039,7 +1039,7 @@ def _get_logits( embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. 
- logits = lm_head.linear_method.apply(lm_head, hidden_states) + logits = lm_head.quant_method.apply(lm_head, hidden_states) if embedding_bias is not None: logits += embedding_bias diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 0565c6e8be3..9b174299857 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -108,9 +108,9 @@ def _get_logits( embedding_bias: Optional[torch.Tensor], ) -> Optional[torch.Tensor]: # Get the logits for the next tokens. - logits = lm_head.linear_method.apply(lm_head, - hidden_states, - bias=embedding_bias) + logits = lm_head.quant_method.apply(lm_head, + hidden_states, + bias=embedding_bias) # Gather logits for TP logits = self._gather_logits(logits) diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 0cb77a7546d..6d1f0cc2eb4 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -3,16 +3,17 @@ import enum from enum import Enum from fractions import Fraction -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch from torch.nn.parameter import Parameter from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -32,7 +33,33 @@ def __init__( group_size: int, desc_act: bool, lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]], ) -> None: + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. 
More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + self.dynamic = dynamic + self.weight_bits = weight_bits self.group_size = group_size self.desc_act = desc_act @@ -47,7 +74,8 @@ def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act})," - f"lm_head_quantized={self.lm_head_quantized}") + f"lm_head_quantized={self.lm_head_quantized}), " + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -68,19 +96,20 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQConfig": + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic = {} if dynamic is None else dynamic + weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) - return cls(weight_bits, group_size, desc_act, lm_head_quantized) + return cls(weight_bits, group_size, desc_act, lm_head_quantized, + dynamic) def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["GPTQLinearMethod"]: - if (isinstance(layer, LinearBase) or - (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): - return GPTQLinearMethod(self) - return None + return get_linear_quant_method(self, layer, prefix, GPTQLinearMethod) class ExllamaState(Enum): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 84c53b2c16d..0a9d86b008d 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -9,17 +9,21 @@ from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) -from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, +from vllm.model_executor.layers.linear import (LinearMethodBase, + UnquantizedLinearMethod, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kernels.mixed_precision import ( MPLinearLayerConfig, choose_mp_linear_kernel) from vllm.model_executor.layers.quantization.utils import replace_parameter +from vllm.model_executor.layers.quantization.utils.gptq_utils import ( + get_linear_quant_method) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( check_marlin_supported, marlin_moe_permute_scales, marlin_repeat_scales_on_all_ranks, verify_marlin_supported) -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + UnquantizedEmbeddingMethod) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, GroupQuantScaleParameter, PackedColumnParameter, @@ -47,12 +51,41 @@ def __init__( desc_act: bool, is_sym: bool, 
lm_head_quantized: bool, + dynamic: Dict[str, Dict[str, Union[int, bool]]], ) -> None: if desc_act and group_size == -1: # In this case, act_order == True is the same as act_order == False # (since we have only one group per output channel) desc_act = False + # GPTQModel use `dynamic` config property to allow per module + # quantization config so each module can be individually optimized. + # Format is Dict[str, Dict] where key is a regex string that can + # perform both positive ("+:" prefixed) or negative ("-:" prefixed) + # matching of a module. + # Default to positive match, override base quant config mode, if no + # prefix is used. Value is in dict format of field key and override + # value. + # Negative matching will skip quantization init for this module + # entirely: + # non-quantized inference. More details and quantization examples can be + # found at: https://github.com/ModelCloud/GPTQModel + # Example: + # # last 1/2 of the layers 10-21 has 8bit vs 4bit for 0-9 + # # last 1/4 of the layers 16-21 has 8bit and group_size 64 + # dynamic = { + # #`.*\.` matches the layers_node prefix + # # positive match layer 10-15 + # r"+:.*\.(?:1[0-5])\..*": {"bits": 8,}, + # # positive match layer 16-21 + # r"+:.*\.(?:1[6-9]|20|21)\..*": {"bits": 8, "group_size": 64,}, + # r"-:.*\.moe\..*": {}, # negative match (skip) all `moe` layers + # } + self.dynamic = dynamic + + self.weight_bits = weight_bits + self.is_sym = is_sym + self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size self.desc_act = desc_act @@ -68,7 +101,8 @@ def __repr__(self) -> str: return (f"GPTQMarlinConfig(quant_type={self.quant_type}, " f"group_size={self.group_size}, " f"desc_act={self.desc_act}, " - f"lm_head_quantized={self.lm_head_quantized})") + f"lm_head_quantized={self.lm_head_quantized}), " + f"dynamic={self.dynamic}") @classmethod def get_name(cls) -> str: @@ -88,6 +122,9 @@ def get_config_filenames(cls) -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": + dynamic = cls.get_from_keys_or(config, ["dynamic"], default={}) + dynamic = {} if dynamic is None else dynamic + weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) desc_act = cls.get_from_keys(config, ["desc_act"]) @@ -95,7 +132,7 @@ def from_config(cls, config: Dict[str, Any]) -> "GPTQMarlinConfig": lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) return cls(weight_bits, group_size, desc_act, is_sym, - lm_head_quantized) + lm_head_quantized, dynamic) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -120,17 +157,15 @@ def override_quantization_method(cls, hf_quant_cfg, def get_quant_method( self, layer: torch.nn.Module, prefix: str - ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod"]]: - if isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) - and self.lm_head_quantized): - return GPTQMarlinLinearMethod(self) - elif isinstance(layer, FusedMoE): + ) -> Optional[Union["GPTQMarlinLinearMethod", "GPTQMarlinMoEMethod", + UnquantizedLinearMethod, UnquantizedEmbeddingMethod]]: + if isinstance(layer, FusedMoE): return GPTQMarlinMoEMethod(self) - return None + return get_linear_quant_method(self, layer, prefix, + GPTQMarlinLinearMethod) @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): - # Extract data from quant config. 
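The `dynamic` comment blocks above describe the per-module override format only in prose. Below is a small, self-contained sketch of how such a map resolves overrides; the real helper, get_dynamic_override, lives in the new vllm/model_executor/layers/quantization/utils/gptq_utils.py added by this patch, and the layer names and defaults here are only illustrative:

    import re

    def resolve(dynamic, layer_name, key, default):
        # "-:"-prefixed patterns exclude the module from quantized init entirely;
        # "+:"-prefixed (or unprefixed) patterns override individual config fields.
        for pattern, overrides in dynamic.items():
            if pattern.startswith("-:"):
                if re.match(pattern.removeprefix("-:"), layer_name):
                    return False  # skip quantization for this module
            elif re.match(pattern.removeprefix("+:"), layer_name):
                return overrides.get(key, default)
        return default

    dynamic = {
        r"+:.*\.(?:1[0-5])\..*": {"bits": 8},  # layers 10-15: override to 8 bits
        r"-:.*\.moe\..*": {},                  # skip all `moe` modules
    }
    assert resolve(dynamic, "model.layers.12.self_attn.qkv_proj", "bits", 4) == 8
    assert resolve(dynamic, "model.layers.3.self_attn.qkv_proj", "bits", 4) == 4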
quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") group_size = quant_config.get("group_size") @@ -143,7 +178,7 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): if quant_method != "gptq": return False - # If we cannot find the info needed in the config, cannot convert. + # Marlin conversion is only valid if required properties are found if (num_bits is None or group_size is None or sym is None or desc_act is None): return False diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py new file mode 100644 index 00000000000..5b0e6299f47 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -0,0 +1,94 @@ +# SPDX-License-Identifier: Apache-2.0 +import re +from copy import deepcopy +from typing import Dict, Optional, Union + +import torch + +from vllm.config import QuantizationConfig +from vllm.model_executor.layers.linear import (LinearBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, UnquantizedEmbeddingMethod) + + +# Match dynamic rules with module name (prefix) and override quantize +# config if module (prefix) matches a rule +def override_config(config: QuantizationConfig, prefix: str): + weight_bits = get_dynamic_override(config, prefix, "bits", + config.weight_bits) + if isinstance(weight_bits, int): + config.weight_bits = weight_bits + group_size = get_dynamic_override(config, prefix, "group_size", + config.group_size) + if isinstance(group_size, int): + config.group_size = group_size + desc_act = get_dynamic_override(config, prefix, "desc_act", + config.desc_act) + if isinstance(desc_act, bool): + config.desc_act = desc_act + + config.pack_factor = 32 // config.weight_bits # packed into int32 + if config.get_name() == "gptq_marlin": + is_sym = get_dynamic_override(config, prefix, "sym", config.is_sym) + if isinstance(is_sym, bool): + config.is_sym = is_sym + + if (config.weight_bits, config.is_sym) not in config.TYPE_MAP: + raise ValueError("Unsupported quantization config: " + f"bits={config.weight_bits}, sym={config.is_sym}") + + config.quant_type = config.TYPE_MAP[(config.weight_bits, + config.is_sym)] + elif config.get_name() == "gptq": + if config.weight_bits not in [2, 3, 4, 8]: + raise ValueError( + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {config.weight_bits} bits.") + + +def get_dynamic_override( + config: QuantizationConfig, + layer_name: str, + key: Optional[str] = None, + default_value: Union[int, bool, + None] = None) -> Union[Dict, int, bool, None]: + for pattern, pattern_dict in config.dynamic.items(): + # Negative match: matched modules are excluded from quantized init + if pattern.startswith("-:"): + if re.match(pattern.removeprefix("-:"), layer_name): + return False + # Positive match: matched modules have quant properties overrides + # base quant config + elif re.match(pattern.removeprefix("+:"), layer_name): + if key is None: + return pattern_dict + else: + return pattern_dict.get(key, default_value) + return default_value + + +def get_linear_quant_method( + config: QuantizationConfig, + layer: torch.nn.Module, + prefix: str, + linear_method_cls: type, +): + cloned_config = deepcopy(config) + parallel_lm_head_quantized = isinstance( + layer, ParallelLMHead) and cloned_config.lm_head_quantized + if isinstance(layer, LinearBase) or parallel_lm_head_quantized: + # False = skip module, None 
= no override, else = Positive match + if get_dynamic_override( # noqa: E712 + cloned_config, # noqa: E712 + layer_name=prefix) == False: # noqa: E712 + if parallel_lm_head_quantized: + return UnquantizedEmbeddingMethod() + return UnquantizedLinearMethod() + + if prefix: + # Dynamic per module/layer rules may override base config + override_config(cloned_config, prefix=prefix) + + return linear_method_cls(cloned_config) + return None diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index e409094dd53..f65dfc3cb32 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -226,24 +226,24 @@ def __init__(self, self.tp_size) self.embedding_dim = embedding_dim - linear_method = None + quant_method = None if quant_config is not None: - linear_method = quant_config.get_quant_method(self, prefix=prefix) - if linear_method is None: - linear_method = UnquantizedEmbeddingMethod() + quant_method = quant_config.get_quant_method(self, prefix=prefix) + if quant_method is None: + quant_method = UnquantizedEmbeddingMethod() # If we are making an embedding layer, then our quantization linear # method must implement the embedding operation. If we are another # layer type like ParallelLMHead, this is not important. is_embedding_layer = type(self.__class__) is VocabParallelEmbedding - linear_method_implements_embedding = method_has_implemented_embedding( - type(linear_method)) - if is_embedding_layer and not linear_method_implements_embedding: + quant_method_implements_embedding = method_has_implemented_embedding( + type(quant_method)) + if is_embedding_layer and not quant_method_implements_embedding: raise NotImplementedError( - f"The class {type(linear_method).__name__} must implement " + f"The class {type(quant_method).__name__} must implement " "the 'embedding' method, see UnquantizedEmbeddingMethod.") - self.linear_method: QuantizeMethodBase = linear_method + self.quant_method: QuantizeMethodBase = quant_method if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -260,13 +260,13 @@ def __init__(self, self.shard_indices.added_vocab_end_index - self.shard_indices.added_vocab_start_index) - self.linear_method.create_weights(self, - self.embedding_dim, - [self.num_embeddings_per_partition], - self.embedding_dim, - self.num_embeddings_padded, - params_dtype=params_dtype, - weight_loader=self.weight_loader) + self.quant_method.create_weights(self, + self.embedding_dim, + [self.num_embeddings_per_partition], + self.embedding_dim, + self.num_embeddings_padded, + params_dtype=params_dtype, + weight_loader=self.weight_loader) @classmethod def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int, @@ -412,8 +412,8 @@ def forward(self, input_): else: masked_input = input_ # Get the embeddings. - output_parallel = self.linear_method.embedding(self, - masked_input.long()) + output_parallel = self.quant_method.embedding(self, + masked_input.long()) # Mask the output embedding. 
if self.tp_size > 1: output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0) From 0b09613250e7ccaa09f435a0984290f822fee1a2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 12 Feb 2025 12:19:53 -0500 Subject: [PATCH 0132/1240] [Bugfix] Allow fallback to AWQ from AWQMarlin at per-layer granularity (#13119) Signed-off-by: Louis Ulmer --- vllm/model_executor/layers/linear.py | 35 ++++++++++--------- .../layers/quantization/awq_marlin.py | 28 +++++++++------ .../layers/quantization/moe_wna16.py | 9 +++-- .../layers/quantization/utils/marlin_utils.py | 15 ++++++++ 4 files changed, 58 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index dad16112082..521724765be 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -290,29 +290,30 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None, output_sizes: Optional[list[int]] = None, prefix: str = ""): - super().__init__(input_size, output_size, skip_bias_add, params_dtype, - quant_config, prefix) - - self.gather_output = gather_output - # Divide the weight matrix along the last dimension. - tp_size = get_tensor_model_parallel_world_size() - assert self.quant_method is not None - self.output_size_per_partition = divide(self.output_size, tp_size) + self.tp_size = get_tensor_model_parallel_world_size() + self.input_size_per_partition = input_size + self.output_size_per_partition = divide(output_size, self.tp_size) self.output_partition_sizes = [self.output_size_per_partition] # If QKV or MergedColumn, use output size of each partition. if hasattr(self, "output_sizes"): self.output_partition_sizes = [ - divide(output_size, tp_size) + divide(output_size, self.tp_size) for output_size in self.output_sizes ] + super().__init__(input_size, output_size, skip_bias_add, params_dtype, + quant_config, prefix) + + self.gather_output = gather_output + if output_sizes is None: output_sizes = [output_size] + assert self.quant_method is not None self.quant_method.create_weights( layer=self, - input_size_per_partition=self.input_size, + input_size_per_partition=self.input_size_per_partition, output_partition_sizes=self.output_partition_sizes, input_size=self.input_size, output_size=self.output_size, @@ -1044,22 +1045,24 @@ def __init__(self, reduce_results: bool = True, quant_config: Optional[QuantizationConfig] = None, prefix: str = ""): + # Divide the weight matrix along the first dimension. + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + self.input_size_per_partition = divide(input_size, self.tp_size) + self.output_size_per_partition = output_size + self.output_partition_sizes = [output_size] + super().__init__(input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix) self.input_is_parallel = input_is_parallel self.reduce_results = reduce_results - # Divide the weight matrix along the last dimension. 
- self.tp_rank = get_tensor_model_parallel_rank() - self.tp_size = get_tensor_model_parallel_world_size() - self.input_size_per_partition = divide(input_size, self.tp_size) assert self.quant_method is not None - self.quant_method.create_weights( layer=self, input_size_per_partition=self.input_size_per_partition, - output_partition_sizes=[self.output_size], + output_partition_sizes=self.output_partition_sizes, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 8849ba29282..a43b2e597c1 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -13,15 +13,17 @@ from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod, set_weight_attrs) -from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq +from vllm.model_executor.layers.quantization.awq import (AWQConfig, + is_layer_skipped_awq) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.marlin_utils import ( apply_awq_marlin_linear, awq_to_marlin_zero_points, check_marlin_supported, - marlin_make_empty_g_idx, marlin_make_workspace, marlin_moe_permute_scales, - marlin_permute_scales, moe_awq_to_marlin_zero_points, - verify_marlin_supported, verify_marlin_supports_shape) + check_marlin_supports_layer, marlin_make_empty_g_idx, + marlin_make_workspace, marlin_moe_permute_scales, marlin_permute_scales, + moe_awq_to_marlin_zero_points, verify_marlin_supported, + verify_marlin_supports_shape) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.parameter import (GroupQuantScaleParameter, PackedvLLMParameter) @@ -40,18 +42,17 @@ class AWQMarlinConfig(QuantizationConfig): 8: scalar_types.uint8, } - def __init__(self, - weight_bits: int, - group_size: int, - zero_point: bool, + def __init__(self, weight_bits: int, group_size: int, zero_point: bool, lm_head_quantized: bool, - modules_to_not_convert: Optional[List[str]] = None) -> None: + modules_to_not_convert: Optional[List[str]], + full_config: Dict[str, Any]) -> None: self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size self.zero_point = zero_point self.lm_head_quantized = lm_head_quantized self.weight_bits = weight_bits self.modules_to_not_convert = modules_to_not_convert or [] + self.full_config = full_config if self.weight_bits not in self.TYPE_MAP: raise ValueError(f"Unsupported num_bits = {self.weight_bits}. " @@ -96,7 +97,7 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig": modules_to_not_convert = cls.get_from_keys_or( config, ["modules_to_not_convert"], None) return cls(weight_bits, group_size, zero_point, lm_head_quantized, - modules_to_not_convert) + modules_to_not_convert, config) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -124,6 +125,13 @@ def get_quant_method(self, layer: torch.nn.Module, (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): if is_layer_skipped_awq(prefix, self.modules_to_not_convert): return UnquantizedLinearMethod() + # Check if the layer is supported by AWQMarlin. 
+ if not check_marlin_supports_layer(layer, self.group_size): + logger.warning_once( + f"Layer '{prefix}' is not supported by AWQMarlin. " + "Falling back to unoptimized AWQ kernels.") + return AWQConfig.from_config( + self.full_config).get_quant_method(layer, prefix) return AWQMarlinLinearMethod(self) elif isinstance(layer, FusedMoE): return AWQMoEMethod(self) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 56fa597e201..b9460e7d798 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -16,6 +16,8 @@ from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + check_marlin_supports_layer) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform @@ -87,8 +89,8 @@ def from_config(cls, config: Dict[str, Any]) -> "MoeWNA16Config": modules_to_not_convert = [] elif linear_quant_method == "awq": has_zp = cls.get_from_keys(config, ["zero_point"]) - modules_to_not_convert = cls.get_from_keys( - config, ["modules_to_not_convert"]) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None) else: raise ValueError("moe_wna16 only support gptq and awq.") @@ -135,7 +137,8 @@ def get_quant_method(self, layer: torch.nn.Module, return GPTQConfig.from_config( self.full_config).get_quant_method(layer, prefix) elif self.linear_quant_method == "awq": - if self.use_marlin: + if self.use_marlin and check_marlin_supports_layer( + layer, self.group_size): return AWQMarlinConfig.from_config( self.full_config).get_quant_method(layer, prefix) else: diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 3beba308324..05e37251aa1 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -6,6 +6,7 @@ import torch from vllm import _custom_ops as ops +from vllm.model_executor.layers.linear import LinearBase from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types @@ -135,6 +136,20 @@ def check_marlin_supports_shape(output_size_per_partition: int, return True, None +def check_marlin_supports_layer(layer: LinearBase, group_size: int) \ + -> bool: + output_size_per_partition = getattr(layer, "output_size_per_partition", + None) or layer.output_size + input_size_per_partition = getattr(layer, "input_size_per_partition", + None) or layer.input_size + + return check_marlin_supports_shape( + output_size_per_partition=output_size_per_partition, + input_size_per_partition=input_size_per_partition, + input_size=layer.input_size, + group_size=group_size)[0] + + def marlin_make_workspace(output_size_per_partition: int, device: torch.device) -> torch.Tensor: max_workspace_size = (output_size_per_partition // From bf80f9356ad36410009630bb64b0ded4e74b51ce Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 12 Feb 2025 14:16:06 -0500 Subject: [PATCH 0133/1240] [CI] Fix failing FP8 cpu offload test (#13170) Signed-off-by: mgoin Signed-off-by: Louis Ulmer --- tests/quantization/test_cpu_offload.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/quantization/test_cpu_offload.py 
b/tests/quantization/test_cpu_offload.py index 29a5721ef36..de03d37a74b 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -1,5 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 - +# SPDX-License-Identifier: Apache-2.0 + # Expanded quantized model tests for CPU offloading # Base tests: tests/basic_correctness/test_cpu_offload.py @@ -14,13 +14,13 @@ reason="fp8 is not supported on this GPU type.") def test_cpu_offload_fp8(): # Test quantization of an unquantized checkpoint - compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct", + compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", ["--quantization", "fp8"], - ["--quantization", "fp8", "--cpu-offload-gb", "2"], + ["--quantization", "fp8", "--cpu-offload-gb", "1"], max_wait_seconds=480) # Test loading a quantized checkpoint - compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [], - ["--cpu-offload-gb", "2"], + compare_two_settings("neuralmagic/Qwen2-1.5B-Instruct-FP8", [], + ["--cpu-offload-gb", "1"], max_wait_seconds=480) From e31b93baba442e15b4febfaa383f0b5e51bc31b0 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Wed, 12 Feb 2025 12:58:11 -0800 Subject: [PATCH 0134/1240] [V1][Bugfix] Copy encoder input ids to fix set iteration issue during VLM abort (#13173) Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com> Signed-off-by: Louis Ulmer --- vllm/v1/core/encoder_cache_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 651bc01aa5c..13ad14e45b3 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -54,7 +54,7 @@ def free_encoder_input(self, request: Request, input_id: int) -> None: def free(self, request: Request) -> None: """Free all cached input ids for the request.""" - input_ids = self.get_cached_input_ids(request) + input_ids = self.get_cached_input_ids(request).copy() for input_id in input_ids: self.free_encoder_input(request, input_id) From f060ae081a2a35106584311d752434b6203ae443 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 12 Feb 2025 22:48:31 -0500 Subject: [PATCH 0135/1240] [CI/Build] Ignore ruff warning up007 (#13182) Signed-off-by: Russell Bryant Signed-off-by: Louis Ulmer --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9892967b82d..849e8781e24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,8 @@ ignore = [ "UP032", # Python 3.8 typing "UP006", "UP035", - + # Can remove once 3.10+ is the minimum Python version + "UP007", ] [tool.mypy] From 0464e64b794a78e79f77d41058350871c4027612 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Wed, 12 Feb 2025 19:51:33 -0800 Subject: [PATCH 0136/1240] [perf-benchmark] cleanup unused Docker images and volumes in H100 benchmark instance (#12706) Signed-off-by: Louis Ulmer --- .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 679abf1814a..df95e46d6dd 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -70,6 +70,12 @@ steps: #key: block-h100 #depends_on: ~ + - label: "Cleanup H100" + agents: + queue: H100 + depends_on: ~ + command: docker system prune -a --volumes --force + - label: "H100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: From e06cbfa3f9950dfd25f9063549b122c2b71d18dc Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Wed, 12 Feb 2025 19:51:51 -0800 Subject: [PATCH 0137/1240] [NVIDIA] Support nvfp4 quantization (#12784) Signed-off-by: Louis Ulmer --- CMakeLists.txt | 18 + cmake/utils.cmake | 18 +- csrc/cuda_utils.h | 12 + csrc/cuda_utils_kernels.cu | 22 +- csrc/ops.h | 4 + csrc/quantization/fp4/nvfp4_quant_entry.cu | 32 ++ csrc/quantization/fp4/nvfp4_quant_kernels.cu | 379 +++++++++++++++++++ csrc/torch_bindings.cpp | 6 + tests/kernels/test_nvfp4_quant.py | 149 ++++++++ tests/test_scalartype.py | 1 + vllm/_custom_ops.py | 57 +++ vllm/scalar_type.py | 3 + 12 files changed, 688 insertions(+), 13 deletions(-) create mode 100644 csrc/quantization/fp4/nvfp4_quant_entry.cu create mode 100644 csrc/quantization/fp4/nvfp4_quant_kernels.cu create mode 100644 tests/kernels/test_nvfp4_quant.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a0fd346c6c1..244ceb721c9 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -264,6 +264,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" "csrc/sparse/cutlass/sparse_compressor_entry.cu" "csrc/cutlass_extensions/common.cpp") @@ -377,6 +378,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") endif() endif() + # FP4 Archs and flags + cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS) + set(SRCS + "csrc/quantization/fp4/nvfp4_quant_kernels.cu" + ) + set_gencode_flags_for_srcs( + SRCS "${SRCS}" + CUDA_ARCHS "${FP4_ARCHS}") + list(APPEND VLLM_EXT_SRC "${SRCS}") + list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1") + message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}") + else() + message(STATUS "Not building NVFP4 as no compatible archs were found.") + # clear FP4_ARCHS + set(FP4_ARCHS) + endif() # # Machete kernels diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 1c1c539819d..c9cd099b82a 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -257,9 +257,9 @@ endmacro() # where `<=` is the version comparison operator. # In other words, for each version in `TGT_CUDA_ARCHS` find the highest version # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. -# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is -# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). 
+# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is +# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add +# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. # # Example: @@ -272,8 +272,8 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) - # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should - # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS + # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should + # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") @@ -283,6 +283,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR endif() endif() + if ("10.0a" IN_LIST SRC_CUDA_ARCHS) + list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a") + if ("10.0" IN_LIST TGT_CUDA_ARCHS) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0") + set(_CUDA_ARCHS "10.0a") + endif() + endif() + list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h index c35224218e9..6f79d2b7445 100644 --- a/csrc/cuda_utils.h +++ b/csrc/cuda_utils.h @@ -1,5 +1,7 @@ #pragma once +#include + #if defined(__CUDACC__) || defined(_NVHPC_CUDA) #define HOST_DEVICE_INLINE __forceinline__ __host__ __device__ #define DEVICE_INLINE __forceinline__ __device__ @@ -10,6 +12,16 @@ #define HOST_INLINE inline #endif +#define CUDA_CHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + int64_t get_device_attribute(int64_t attribute, int64_t device_id); int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id); diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu index d6f9eb646fa..0627a42675b 100644 --- a/csrc/cuda_utils_kernels.cu +++ b/csrc/cuda_utils_kernels.cu @@ -1,16 +1,22 @@ +#include "cuda_utils.h" #ifdef USE_ROCM #include #include #endif + int64_t get_device_attribute(int64_t attribute, int64_t device_id) { - int device, value; - if (device_id < 0) { - cudaGetDevice(&device); - } else { - device = device_id; - } - cudaDeviceGetAttribute(&value, static_cast(attribute), - device); + // Return the cached value on subsequent calls + static int value = [=]() { + int device = static_cast(device_id); + if (device < 0) { + CUDA_CHECK(cudaGetDevice(&device)); + } + int value; + CUDA_CHECK(cudaDeviceGetAttribute( + &value, static_cast(attribute), device)); + return static_cast(value); + }(); + return value; } diff --git a/csrc/ops.h b/csrc/ops.h index e39d4ef3188..70e864cc6a8 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -195,6 +195,10 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit); +void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, + torch::Tensor& output_scale, + torch::Tensor const& input_scale); + void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor const& scale); diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu new file mode 100644 index 
00000000000..b1426c43b45 --- /dev/null +++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#if defined ENABLE_NVFP4 && ENABLE_NVFP4 +void scaled_fp4_quant_sm100a(torch::Tensor const& output, + torch::Tensor const& input, + torch::Tensor const& output_sf, + torch::Tensor const& input_sf); +#endif + +void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input, + torch::Tensor& output_sf, torch::Tensor const& input_sf) { +#if defined ENABLE_NVFP4 && ENABLE_NVFP4 + return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf); +#endif + TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization"); +} diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu new file mode 100644 index 00000000000..c3b8e9b3ec4 --- /dev/null +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include + +#include "cuda_utils.h" + +// Get type2 from type or vice versa (applied to half and bfloat16) +template +struct TypeConverter { + using Type = half2; +}; // keep for generality + +template <> +struct TypeConverter { + using Type = half; +}; + +template <> +struct TypeConverter { + using Type = half2; +}; + +template <> +struct TypeConverter<__nv_bfloat162> { + using Type = __nv_bfloat16; +}; + +template <> +struct TypeConverter<__nv_bfloat16> { + using Type = __nv_bfloat162; +}; + +#define ELTS_PER_THREAD 8 + +constexpr int CVT_FP4_ELTS_PER_THREAD = 8; +constexpr int CVT_FP4_SF_VEC_SIZE = 16; + +// Convert 8 float32 values into 8 e2m1 values (represented as one uint32_t). 
+inline __device__ uint32_t fp32_vec_to_e2m1(float (&array)[8]) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0]), "f"(array[1]), "f"(array[2]), "f"(array[3]), + "f"(array[4]), "f"(array[5]), "f"(array[6]), "f"(array[7])); + return val; +#else + return 0; +#endif +} + +// Convert 4 float2 values into 8 e2m1 values (represented as one uint32_t). +inline __device__ uint32_t fp32_vec_to_e2m1(float2 (&array)[4]) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + uint32_t val; + asm volatile( + "{\n" + ".reg .b8 byte0;\n" + ".reg .b8 byte1;\n" + ".reg .b8 byte2;\n" + ".reg .b8 byte3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte0, %2, %1;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte1, %4, %3;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte2, %6, %5;\n" + "cvt.rn.satfinite.e2m1x2.f32 byte3, %8, %7;\n" + "mov.b32 %0, {byte0, byte1, byte2, byte3};\n" + "}" + : "=r"(val) + : "f"(array[0].x), "f"(array[0].y), "f"(array[1].x), "f"(array[1].y), + "f"(array[2].x), "f"(array[2].y), "f"(array[3].x), "f"(array[3].y)); + return val; +#else + return 0; +#endif +} + +// Fast reciprocal. +inline __device__ float reciprocal_approximate_ftz(float a) { + float b; + asm volatile("rcp.approx.ftz.f32 %0, %1;\n" : "=f"(b) : "f"(a)); + return b; +} + +template +__device__ uint8_t* cvt_quant_to_fp4_get_sf_out_offset(int rowIdx, int colIdx, + int numCols, + SFType* SFout) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) + static_assert(CVT_FP4_NUM_THREADS_PER_SF == 1 || + CVT_FP4_NUM_THREADS_PER_SF == 2); + + // One pair of threads write one SF to global memory. + // TODO: stage through smem for packed STG.32 + // is it better than STG.8 from 4 threads ? + if (threadIdx.x % CVT_FP4_NUM_THREADS_PER_SF == 0) { + // SF vector index (16 elements share one SF in the K dimension). + int32_t kIdx = colIdx / CVT_FP4_NUM_THREADS_PER_SF; + int32_t mIdx = rowIdx; + + // SF layout [numMTiles, numKTiles, 32 (mTile), 4 (mTile), 4(kTile)] + // --> index [mTileIdx, kTileIdx, outerMIdx, innerMIdx, innerKIdx] + + int32_t mTileIdx = mIdx / (32 * 4); + // SF vector size 16. + int factor = CVT_FP4_SF_VEC_SIZE * 4; + int32_t numKTiles = (numCols + factor - 1) / factor; + int64_t mTileStride = numKTiles * 32 * 4 * 4; + + int32_t kTileIdx = (kIdx / 4); + int64_t kTileStride = 32 * 4 * 4; + + // M tile layout [32, 4] is column-major. + int32_t outerMIdx = (mIdx % 32); + int64_t outerMStride = 4 * 4; + + int32_t innerMIdx = (mIdx % (32 * 4)) / 32; + int64_t innerMStride = 4; + + int32_t innerKIdx = (kIdx % 4); + int64_t innerKStride = 1; + + // Compute the global offset. + int64_t SFOffset = mTileIdx * mTileStride + kTileIdx * kTileStride + + outerMIdx * outerMStride + innerMIdx * innerMStride + + innerKIdx * innerKStride; + + return reinterpret_cast(SFout) + SFOffset; + } +#endif + return nullptr; +} + +// Define a 16 bytes packed data type. 
+template <class Type>
+struct PackedVec {
+  typename TypeConverter<Type>::Type elts[4];
+};
+
+template <>
+struct PackedVec<__nv_fp8_e4m3> {
+  __nv_fp8x2_e4m3 elts[8];
+};
+
+// Quantizes the provided PackedVec into the uint32_t output
+template <class Type, bool UE8M0_SF>
+__device__ uint32_t cvt_warp_fp16_to_fp4(PackedVec<Type>& vec, float SFScaleVal,
+                                         uint8_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  // Get absolute maximum values among the local 8 values.
+  auto localMax = __habs2(vec.elts[0]);
+
+  // Local maximum value.
+  #pragma unroll
+  for (int i = 1; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    localMax = __hmax2(localMax, __habs2(vec.elts[i]));
+  }
+
+  // Get the absolute maximum among all 16 values (two threads).
+  localMax = __hmax2(__shfl_xor_sync(uint32_t(-1), localMax, 1), localMax);
+  // Get the final absolute maximum values.
+  float vecMax = float(__hmax(localMax.x, localMax.y));
+
+  // Get the SF (max value of the vector / max value of e2m1).
+  // maximum value of e2m1 = 6.0.
+  // TODO: use half as compute data type.
+  float SFValue = SFScaleVal * (vecMax * reciprocal_approximate_ftz(6.0f));
+  // 8 bits representation of the SF.
+  uint8_t fp8SFVal;
+  // Write the SF to global memory (STG.8).
+  if constexpr (UE8M0_SF) {
+    // Extract the 8 exponent bits from float32.
+    // float 32bits = 1 sign bit + 8 exponent bits + 23 mantissa bits.
+    uint32_t tmp = reinterpret_cast<uint32_t&>(SFValue) >> 23;
+    fp8SFVal = tmp & 0xff;
+    // Convert back to fp32.
+    reinterpret_cast<uint32_t&>(SFValue) = tmp << 23;
+  } else {
+    // Here SFValue is always positive, so E4M3 is the same as UE4M3.
+    __nv_fp8_e4m3 tmp = __nv_fp8_e4m3(SFValue);
+    reinterpret_cast<__nv_fp8_e4m3&>(fp8SFVal) = tmp;
+    // Convert back to fp32.
+    SFValue = float(tmp);
+  }
+  // Get the output scale.
+  // Recipe: final_scale = reciprocal(fp32(fp8(SFValue * SFScaleVal))) *
+  // reciprocal(SFScaleVal))
+  float outputScale =
+      SFValue != 0 ? reciprocal_approximate_ftz(
+                         SFValue * reciprocal_approximate_ftz(SFScaleVal))
+                   : 0.0f;
+
+  if (SFout) {
+    // Write the SF to global memory (STG.8).
+    *SFout = fp8SFVal;
+  }
+
+  // Convert the input to float.
+  float2 fp2Vals[CVT_FP4_ELTS_PER_THREAD / 2];
+
+  #pragma unroll
+  for (int i = 0; i < CVT_FP4_ELTS_PER_THREAD / 2; i++) {
+    if constexpr (std::is_same_v<Type, half>) {
+      fp2Vals[i] = __half22float2(vec.elts[i]);
+    } else {
+      fp2Vals[i] = __bfloat1622float2(vec.elts[i]);
+    }
+    fp2Vals[i].x *= outputScale;
+    fp2Vals[i].y *= outputScale;
+  }
+
+  // Convert to e2m1 values.
+  uint32_t e2m1Vec = fp32_vec_to_e2m1(fp2Vals);
+
+  // Write the e2m1 values to global memory.
+  return e2m1Vec;
+#else
+  return 0;
+#endif
+}
+
+// Use UE4M3 by default.
+template <class Type, bool UE8M0_SF>
+__global__ void
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+__launch_bounds__(512, 4) cvt_fp16_to_fp4(
+#else
+cvt_fp16_to_fp4(
+#endif
+    int32_t numRows, int32_t numCols, Type const* in, float const* SFScale,
+    uint32_t* out, uint32_t* SFout) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
+  using PackedVec = PackedVec<Type>;
+  static constexpr int CVT_FP4_NUM_THREADS_PER_SF =
+      (CVT_FP4_SF_VEC_SIZE / CVT_FP4_ELTS_PER_THREAD);
+  static_assert(sizeof(PackedVec) == sizeof(Type) * CVT_FP4_ELTS_PER_THREAD,
+                "Vec size is not matched.");
+
+  // Get the global scaling factor, which will be applied to the SF.
+  // Note SFScale is the same as next GEMM's alpha, which is
+  // (448.f / (Alpha_A / 6.f)).
+  float const SFScaleVal = SFScale == nullptr ? 1.0f : SFScale[0];
+
+  // Input tensor row/col loops.
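+  // Rows are strided across thread blocks and columns across threads; each
+  // iteration covers CVT_FP4_ELTS_PER_THREAD (8) contiguous input elements.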
+  for (int rowIdx = blockIdx.x; rowIdx < numRows; rowIdx += gridDim.x) {
+    for (int colIdx = threadIdx.x; colIdx < numCols / CVT_FP4_ELTS_PER_THREAD;
+         colIdx += blockDim.x) {
+      int64_t inOffset = rowIdx * (numCols / CVT_FP4_ELTS_PER_THREAD) + colIdx;
+      PackedVec in_vec = reinterpret_cast<PackedVec const*>(in)[inOffset];
+      // Get the output tensor offset.
+      // Same as inOffset because 8 elements are packed into one uint32_t.
+      int64_t outOffset = inOffset;
+      auto& out_pos = out[outOffset];
+
+      auto sf_out =
+          cvt_quant_to_fp4_get_sf_out_offset<uint32_t,
+                                             CVT_FP4_NUM_THREADS_PER_SF>(
+              rowIdx, colIdx, numCols, SFout);
+
+      out_pos =
+          cvt_warp_fp16_to_fp4<Type, UE8M0_SF>(in_vec, SFScaleVal, sf_out);
+    }
+  }
+#endif
+}
+
+template <typename T>
+void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale,
+                           int64_t* output, int32_t* SFOuput, bool useUE8M0,
+                           int multiProcessorCount, cudaStream_t stream) {
+  // Grid, Block size.
+  // Each thread converts 8 values.
+  dim3 block(std::min(int(n / ELTS_PER_THREAD), 512));
+  // Get number of blocks per SM (assume we can fully utilize the SM).
+  int const numBlocksPerSM = 2048 / block.x;
+  dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM));
+
+  // Launch the cvt kernel.
+  if (useUE8M0) {
+    cvt_fp16_to_fp4<T, true><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
+        reinterpret_cast<uint32_t*>(SFOuput));
+  } else {
+    cvt_fp16_to_fp4<T, false><<<grid, block, 0, stream>>>(
+        m, n, input, SFScale, reinterpret_cast<uint32_t*>(output),
+        reinterpret_cast<uint32_t*>(SFOuput));
+  }
+}
+
+// Instantiate the function.
+template void invokeFP4Quantization(int m, int n, half const* input,
+                                    float const* SFScale, int64_t* output,
+                                    int32_t* SFOuput, bool useUE8M0,
+                                    int multiProcessorCount,
+                                    cudaStream_t stream);
+
+template void invokeFP4Quantization(int m, int n, __nv_bfloat16 const* input,
+                                    float const* SFScale, int64_t* output,
+                                    int32_t* SFOuput, bool useUE8M0,
+                                    int multiProcessorCount,
+                                    cudaStream_t stream);
+
+void scaled_fp4_quant_sm100a(torch::Tensor const& output,
+                             torch::Tensor const& input,
+                             torch::Tensor const& output_sf,
+                             torch::Tensor const& input_sf) {
+  int32_t m = input.size(0);
+  int32_t n = input.size(1);
+
+  TORCH_CHECK(n % 16 == 0, "The N dimension must be multiple of 16.");
+
+  int multiProcessorCount =
+      get_device_attribute(cudaDevAttrMultiProcessorCount, -1);
+
+  auto input_sf_ptr = static_cast<float const*>(input_sf.data_ptr());
+  auto sf_out = static_cast<int32_t*>(output_sf.data_ptr());
+  auto output_ptr = static_cast<int64_t*>(output.data_ptr());
+  at::cuda::CUDAGuard device_guard{(char)input.get_device()};
+  auto stream = at::cuda::getStreamFromPool(false, input.get_device());
+  if (stream == nullptr) {
+    std::cerr << "Warning: Null CUDA stream" << std::endl;
+  }
+
+  // We don't support e8m0 scales at this moment.
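+  // For example: with SFScaleVal = 1.0 and a block amax of 4.5, SFValue is
+  // 4.5 / 6.0 = 0.75; UE4M3 stores fp8_e4m3(0.75) exactly, while UE8M0 keeps
+  // only the exponent bits and the stored scale rounds down to 0.5.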
+ bool useUE8M0 = false; + + switch (input.scalar_type()) { + case torch::kHalf: { + auto input_ptr = reinterpret_cast(input.data_ptr()); + invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, + useUE8M0, multiProcessorCount, stream); + break; + } + case torch::kBFloat16: { + auto input_ptr = reinterpret_cast<__nv_bfloat16 const*>(input.data_ptr()); + invokeFP4Quantization(m, n, input_ptr, input_sf_ptr, output_ptr, sf_out, + useUE8M0, multiProcessorCount, stream); + break; + } + default: { + std::cerr << "Observing: " << input.scalar_type() + << " for the input datatype which is invalid"; + throw std::runtime_error( + "Unsupported input data type for quantize_to_fp4."); + } + } +} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c03806f430a..784ded26299 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -423,6 +423,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, &dynamic_per_token_scaled_fp8_quant); + // Compute NVFP4 block quantized tensor. + ops.def( + "scaled_fp4_quant(Tensor! output, Tensor input," + " Tensor! output_scale, Tensor input_scale) -> ()"); + ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant); + // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale," diff --git a/tests/kernels/test_nvfp4_quant.py b/tests/kernels/test_nvfp4_quant.py new file mode 100644 index 00000000000..93735fc096d --- /dev/null +++ b/tests/kernels/test_nvfp4_quant.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.platforms import current_platform +from vllm.scalar_type import scalar_types + +if not current_platform.has_device_capability(100): + pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.", + allow_module_level=True) + +DTYPES = [torch.float16, torch.bfloat16] +SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)] +PAD_SHAPES = [(90, 64), (150, 64), (128, 48), (128, 80), (150, 80), (90, 48), + (90, 128), (150, 128), (150, 48), (90, 80)] +SEEDS = [42] +CUDA_DEVICES = ['cuda:0'] + +FLOAT4_E2M1_MAX = scalar_types.float4_e2m1fn.max() +FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max + +# E2M1 to float +# 0111 -> 6 +# 0110 -> 4 +# 0101 -> 3 +# 0100 -> 2 +# 0011 -> 1.5 +# 0010 -> 1 +# 0001 -> 0.5 +# 0000 -> 0 +E2M1_TO_FLOAT32 = [ + 0., 0.5, 1., 1.5, 2., 3., 4., 6., 0., -0.5, -1., -1.5, -2., -3., -4., -6. 
+] +BLOCK_SIZE = 16 + + +def cast_from_fp4(x, m, n): + # The fp4 values are packed in uint8 as [v_1st | v_2nd] + v_2nd = x & 0xF + v_1st = (x >> 4) & 0xF + c = torch.stack((v_2nd, v_1st), dim=-1) + out = torch.tensor([E2M1_TO_FLOAT32[x] for x in c.flatten()]) + out = out.reshape(m, n).to(torch.float32) + return out + + +def cast_to_fp4(x): + sign = torch.sign(x) + x = torch.abs(x) + x[(x >= 0.0) & (x <= 0.25)] = 0.0 + x[(x > 0.25) & (x < 0.75)] = 0.5 + x[(x >= 0.75) & (x <= 1.25)] = 1.0 + x[(x > 1.25) & (x < 1.75)] = 1.5 + x[(x >= 1.75) & (x <= 2.5)] = 2.0 + x[(x > 2.5) & (x < 3.5)] = 3.0 + x[(x >= 3.5) & (x <= 5.0)] = 4.0 + x[x > 5.0] = 6.0 + return x * sign + + +def get_reciprocal(x): + if isinstance(x, torch.Tensor): + return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x) + elif isinstance(x, (float, int)): + return 0.0 if x == 0 else 1.0 / x + else: + raise TypeError("Input must be a float, int, or a torch.Tensor.") + + +def ref_nvfp4_quant(x, global_scale): + assert global_scale.dtype == torch.float32 + assert x.ndim == 2 + m, n = x.shape + x = torch.reshape(x, (m, n // BLOCK_SIZE, BLOCK_SIZE)) + vec_max = torch.max(torch.abs(x), dim=-1, + keepdim=True)[0].to(torch.float32) + scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX)) + scale = scale.to(torch.float8_e4m3fn).to(torch.float32) + output_scale = get_reciprocal(scale * get_reciprocal(global_scale)) + + scaled_x = x.to(torch.float32) * output_scale + clipped_x = torch.clamp(scaled_x, -6.0, 6.0).reshape(m, n) + return cast_to_fp4(clipped_x), scale.squeeze(-1) + + +def recover_swizzled_scales(scale, m, n): + round_up = lambda x, y: (x + y - 1) // y * y + rounded_m = round_up(m, 128) + scale_n = n // BLOCK_SIZE + rounded_n = round_up(scale_n, 4) + # Recover the swizzled scaling factor to linear layout + tmp = torch.reshape(scale, (1, rounded_m // 128, rounded_n // 4, 32, 4, 4)) + tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5)) + result = torch.reshape(tmp, (rounded_m, rounded_n)).to(torch.float32) + return result[:m, :scale_n] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_quantize_to_fp4( + dtype: torch.dtype, + shape: tuple[int, int], + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + m, n = shape + + x = torch.randn((m, n), dtype=dtype) + tensor_amax = torch.abs(x).max().to(torch.float32) + global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax + out_ref, scale_ref = ref_nvfp4_quant(x, global_scale) + + out, out_scale = ops.scaled_fp4_quant(x, global_scale) + scale_ans = recover_swizzled_scales(out_scale, m, n) + out_ans = cast_from_fp4(out, m, n) + + torch.testing.assert_close(out_ans, out_ref) + torch.testing.assert_close(scale_ans, scale_ref) + + +@pytest.mark.parametrize("pad_shape", PAD_SHAPES) +@torch.inference_mode() +def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None: + dtype = torch.float16 + current_platform.seed_everything(42) + torch.set_default_device('cuda:0') + + m, n = pad_shape + + x = torch.randn((m, n), dtype=dtype) + + tensor_amax = torch.abs(x).max().to(torch.float32) + global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax + out_ref, scale_ref = ref_nvfp4_quant(x, global_scale) + + out, out_scale = ops.scaled_fp4_quant(x, global_scale) + + scale_ans = recover_swizzled_scales(out_scale, m, n) + out_ans = 
cast_from_fp4(out, m, n)
+
+    torch.testing.assert_close(out_ans, out_ref)
+    torch.testing.assert_close(scale_ans, scale_ref)
diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py
index 6e36f2c337f..d0e57ea86fc 100644
--- a/tests/test_scalartype.py
+++ b/tests/test_scalartype.py
@@ -11,6 +11,7 @@
     (0, 15, scalar_types.uint4),
     (-8, 7, scalar_types.uint4b8),
     (-128, 127, scalar_types.uint8b128),
+    (-6., 6., scalar_types.float4_e2m1fn),
     (-28., 28., scalar_types.float6_e3m2f),
     (torch.int8, scalar_types.int8),
     (torch.uint8, scalar_types.uint8),
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index a6823501676..67843c17740 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -765,6 +765,63 @@ def permute_cols(a: torch.Tensor, perm: torch.Tensor) -> torch.Tensor:
     return torch.ops._C.permute_cols(a, perm)
+# fp4
+def scaled_fp4_quant(
+    input: torch.Tensor,
+    input_global_scale: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize input tensor to FP4 and return quantized tensor and scale.
+
+    This function quantizes the last dimension of the given tensor `input`. For
+    every 16 consecutive elements, a single dynamically computed scaling factor
+    is shared. This scaling factor is quantized using the `input_global_scale`
+    and is stored in a swizzled layout (see
+    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).
+
+    Args:
+        input: The input tensor to be quantized to FP4
+        input_global_scale: A scalar scaling factor for the entire tensor.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4, with every
+        two values packed into a uint8, and the float8_e4m3 scaling factors
+        in the swizzled layout.
+    """
+    assert input.ndim >= 1, (
+        f'input.ndim needs to be >= 1, but got {input.ndim}.')
+    other_dims = 1 if input.ndim == 1 else -1
+    input = input.reshape(other_dims, input.shape[-1])
+    m, n = input.shape
+    block_size = 16
+    device = input.device
+
+    assert n % block_size == 0, (
+        f'last dim has to be multiple of 16, but got {n}.')
+    assert input.dtype in (torch.float16, torch.bfloat16), (
+        f'input.dtype needs to be fp16 or bf16 but got {input.dtype}.')
+
+    # Two fp4 values will be packed into a uint8.
+    output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
+
+    # We use the rounded values to store the swizzled values. Due to the
+    # requirement of the Tensor Core, the minimum tile is 128x4 for the scales.
+    # So, we first pad the scales to multiples of 128 and 4. Then, the scales
+    # (in float8_e4m3fn) are packed into an int32 for every 4 values.
More: + # https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x + round_up = lambda x, y: (x + y - 1) // y * y + rounded_m = round_up(m, 128) + scale_n = n // block_size + rounded_n = round_up(scale_n, 4) + output_scale = torch.empty((rounded_m, rounded_n // 4), + device=device, + dtype=torch.int32) + + torch.ops._C.scaled_fp4_quant(output, input, output_scale, + input_global_scale) + output_scale = output_scale.view(torch.float8_e4m3fn) + return output, output_scale + + # fp8 def scaled_fp8_quant( input: torch.Tensor, diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9f6e85920ac..1d7675dda43 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -321,6 +321,9 @@ class scalar_types: # fp6, https://github.com/usyd-fsalab/fp6_llm/tree/main float6_e3m2f = ScalarType.float_(3, 2, True, NanRepr.NONE) + # fp4, https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf + float4_e2m1fn = ScalarType.float_(2, 1, True, NanRepr.NONE) + # "gptq" types uint2b2 = ScalarType.uint(2, 2) uint3b4 = ScalarType.uint(3, 4) From dde4651d4f62886954e5da85046fbc7960b99222 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 12 Feb 2025 22:52:11 -0500 Subject: [PATCH 0138/1240] [Bugfix][Example] Fix GCed profiling server for TPU (#12792) Signed-off-by: mgoin Signed-off-by: Louis Ulmer --- examples/offline_inference/profiling_tpu/profiling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index b1fe829b3c3..d54117d6262 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -24,7 +24,7 @@ def main(args: argparse.Namespace): engine_args = EngineArgs.from_cli_args(args) llm = LLM(**dataclasses.asdict(engine_args)) - _ = xp.start_server(9012) + server = xp.start_server(9012) # noqa: F841 sampling_params = SamplingParams( temperature=0.0, From e100355bdb82f3b6507d596c6bdfdc54a4c7711a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 13 Feb 2025 12:26:21 +0800 Subject: [PATCH 0139/1240] [VLM] Implement merged multimodal processor for Mllama (#11427) Signed-off-by: Louis Ulmer --- .../vision_language/test_mllama.py | 71 ++- .../multimodal/processing/test_common.py | 13 +- vllm/inputs/preprocess.py | 90 +++- vllm/inputs/registry.py | 3 +- vllm/model_executor/models/mllama.py | 408 +++++++++--------- vllm/multimodal/inputs.py | 16 + vllm/multimodal/processing.py | 60 ++- vllm/multimodal/profiling.py | 28 +- 8 files changed, 456 insertions(+), 233 deletions(-) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 4cd2dbdb4f9..202516f4c20 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -7,11 +7,11 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm import LLM, SamplingParams from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) -from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID, - MllamaForConditionalGeneration) +from vllm.model_executor.models.mllama import MllamaForConditionalGeneration from vllm.multimodal.image import rescale_image_size from vllm.sequence 
import SampleLogprobs @@ -21,6 +21,7 @@ from ...utils import check_logprobs_close _LIMIT_IMAGE_PER_PROMPT = 3 +MLLAMA_IMAGE_TOKEN_ID = 128256 LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] @@ -396,6 +397,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, ) +@large_gpu_test(min_gb=48) +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +def test_explicit_implicit_prompt( + image_assets: _ImageAssets, + model: str, + dtype: str, + max_tokens: int, +): + stop_sign = image_assets[0].pil_image + # yapf: disable + prompts = [ + # explicit prompt + { + "encoder_prompt": { + "prompt": "<|image|>", + "multi_modal_data": {"image": stop_sign}, + }, + "decoder_prompt": { + "prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374], # noqa: E501 + } + }, + { + "encoder_prompt": "Not <|image|>", + "decoder_prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501 + }, + # implicit prompt + { + "prompt": "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501 + "multi_modal_data": {"image": stop_sign}, + }, + { + "prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501 + }, + ] + # yapf: enable + llm = LLM( + model=model, + dtype=dtype, + max_model_len=4096, + max_num_seqs=2, + tensor_parallel_size=1, + enforce_eager=True, + ) + sampling_params = SamplingParams( + temperature=0, + max_tokens=max_tokens, + ) + outputs = llm.generate(prompts, sampling_params) + n_prompts = len(prompts) + explicit_outputs = outputs[:n_prompts // 2] + implicit_outputs = outputs[n_prompts // 2:] + for exp_output, imp_output in zip(explicit_outputs, implicit_outputs): + assert exp_output.outputs[0].text == imp_output.outputs[0].text + + @large_gpu_test(min_gb=48) @pytest.mark.core_model @pytest.mark.parametrize("model", models) @@ -458,6 +517,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, images=images) +class DummyModel: + image_token_id = MLLAMA_IMAGE_TOKEN_ID + + @pytest.mark.core_model @pytest.mark.parametrize( "input_indices_and_output", @@ -499,7 +562,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None: use_cuda_graph=False, ) - dummy: dict[str, str] = {} + dummy = DummyModel() cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ .get_cross_attention_mask(dummy, @@ -556,7 +619,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None: use_cuda_graph=False, ) - dummy: dict[str, str] = {} + dummy = DummyModel() full_text_row_masked_out_mask = MllamaForConditionalGeneration\ .get_full_text_row_masked_out_mask(dummy, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 6244056c747..67ef8b17ab8 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -85,6 +85,14 @@ def _test_processing_correctness( partial(random_audio, rng, min_len=512, max_len=1024, sr=16000), } + tokenizer_encode_kwargs = {} + if model_config.hf_config.model_type == "mllama": + # For Mllama, tokenizer will always add bos_token at the beginning of + # prompt by default, causing hf_processor outputs incorrect token ids. + # So we need use `add_special_tokens=False` here to leave bos_token + # to be added by the processor. 
+ tokenizer_encode_kwargs = {"add_special_tokens": False} + for batch_idx in range(num_batches): mm_data = { k: @@ -122,7 +130,7 @@ def _test_processing_correctness( f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") baseline_tokenized_result = baseline_processor.apply( - tokenizer.encode(prompt), + tokenizer.encode(prompt, **tokenizer_encode_kwargs), mm_data=mm_data, hf_processor_mm_kwargs={}, ) @@ -131,7 +139,7 @@ def _test_processing_correctness( f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") cached_tokenized_result = cached_processor.apply( - tokenizer.encode(prompt), + tokenizer.encode(prompt, **tokenizer_encode_kwargs), mm_data=mm_data, hf_processor_mm_kwargs={}, ) @@ -155,6 +163,7 @@ def _test_processing_correctness( "llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf", "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "meta-llama/Llama-3.2-11B-Vision-Instruct", "TIGER-Lab/Mantis-8B-siglip-llama3", "mistral-community/pixtral-12b", "openbmb/MiniCPM-o-2_6", diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 656f2f2b766..bc5856990da 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio -from typing import List, Mapping, Optional, Union +from typing import List, Mapping, Optional, Tuple, Union, cast from typing_extensions import assert_never @@ -9,7 +9,8 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, + MultiModalInputs) from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -495,6 +496,51 @@ def _build_enc_dec_llm_inputs( decoder=decoder_inputs, ) + def _separate_enc_dec_inputs_from_mm_processor_outputs( + self, + inputs: SingletonInputs, + decoder_inputs_to_override: Optional[SingletonInputs] = None, + ) -> Tuple[SingletonInputs, SingletonInputs]: + """ + For encoder/decoder models only: + Separate Encoder/Decoder inputs from a MultiModalEncDecInputs + """ + encoder_inputs: SingletonInputs + decoder_inputs: SingletonInputs + if inputs["type"] == "multimodal": + # Multimodal data inputs + assert ("encoder_prompt" in inputs + and "encoder_prompt_token_ids" in inputs) + inputs = cast(MultiModalEncDecInputs, inputs) + encoder_inputs = token_inputs( + prompt=inputs["encoder_prompt"], + prompt_token_ids=inputs["encoder_prompt_token_ids"], + ) + if decoder_inputs_to_override is not None: + decoder_inputs = MultiModalInputs( + type="multimodal", + prompt=decoder_inputs_to_override.get("prompt", ""), + prompt_token_ids=decoder_inputs_to_override[ + "prompt_token_ids"], + mm_kwargs=inputs["mm_kwargs"], + mm_placeholders=inputs["mm_placeholders"], + ) + else: + decoder_inputs = MultiModalInputs( + type="multimodal", + prompt=inputs["prompt"], + prompt_token_ids=inputs["prompt_token_ids"], + mm_kwargs=inputs["mm_kwargs"], + mm_placeholders=inputs["mm_placeholders"], + ) + elif inputs["type"] == "token": + # Text-only inputs + encoder_inputs = token_inputs(prompt="", prompt_token_ids=[]) + decoder_inputs = decoder_inputs_to_override or inputs + else: + assert_never(inputs) # type: ignore[arg-type] + return encoder_inputs, decoder_inputs + def _process_encoder_decoder_prompt( self, prompt: PromptType, @@ -539,7 +585,6 @@ def 
_process_encoder_decoder_prompt( prompt["encoder_prompt"], request_id=request_id, ) - if (decoder_input := prompt["decoder_prompt"]) is None: decoder_inputs = None else: @@ -547,13 +592,28 @@ def _process_encoder_decoder_prompt( decoder_input, request_id=request_id, ) + # For multimodal model, override decoder prompt from processor + # with explicit decoder prompt. + if self.model_config.is_multimodal_model and ( + self._can_process_multimodal()): + encoder_inputs, decoder_inputs = ( + self._separate_enc_dec_inputs_from_mm_processor_outputs( + encoder_inputs, decoder_inputs)) else: - encoder_inputs = self._prompt_to_llm_inputs( + inputs = self._prompt_to_llm_inputs( prompt, request_id=request_id, ) + if self.model_config.is_multimodal_model and ( + self._can_process_multimodal()): + # Encoder-Decoder Multimodal model + encoder_inputs, decoder_inputs = ( + self._separate_enc_dec_inputs_from_mm_processor_outputs( + inputs)) + else: + encoder_inputs = inputs - decoder_inputs = None + decoder_inputs = None return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) @@ -583,13 +643,29 @@ async def _process_encoder_decoder_prompt_async( encoder_inputs, decoder_inputs = await asyncio.gather( encoder_task, decoder_task) + + # For multimodal model, override decoder prompt from processor + # with explicit decoder prompt. + if self.model_config.is_multimodal_model and ( + self._can_process_multimodal()): + encoder_inputs, decoder_inputs = ( + self._separate_enc_dec_inputs_from_mm_processor_outputs( + encoder_inputs, decoder_inputs)) else: - encoder_inputs = await self._prompt_to_llm_inputs_async( + inputs = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, ) + if self.model_config.is_multimodal_model and ( + self._can_process_multimodal()): + # Encoder-Decoder Multimodal model + encoder_inputs, decoder_inputs = ( + self._separate_enc_dec_inputs_from_mm_processor_outputs( + inputs)) + else: + encoder_inputs = inputs - decoder_inputs = None + decoder_inputs = None return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index cd421443981..87b7a7631e4 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -350,7 +350,8 @@ def dummy_data_for_profiling( ) processor = mm_registry.create_processor(model_config, tokenizer) profiler = MultiModalProfiler(processor) - dummy_data = profiler.get_dummy_data(seq_len) + dummy_data = profiler.get_dummy_data( + seq_len, is_encoder_data=is_encoder_data) else: model_cls, _ = get_model_architecture(model_config) if is_encoder_data: diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index d1cb04cdb24..3ca22d346b7 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -23,14 +23,15 @@ import torch.nn.functional as F import torch.utils.checkpoint import transformers.models.mllama.configuration_mllama as config_mllama -from PIL import Image +from PIL.Image import Image from torch import nn +from transformers import BatchFeature, MllamaConfig from transformers.modeling_outputs import (BaseModelOutput, CausalLMOutputWithPast) from transformers.models.mllama.image_processing_mllama import ( get_optimal_tiled_canvas) from transformers.models.mllama.processing_mllama import ( - get_cross_attention_token_mask) + MllamaProcessor, get_cross_attention_token_mask) import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType @@ -38,8 
+39,6 @@ from vllm.attention.selector import _Backend from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs, - InputContext, TokenInputs, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -54,8 +53,13 @@ default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import SequenceData -from vllm.utils import is_list_of +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataDict, MultiModalDataItems) +from vllm.multimodal.processing import (BaseProcessingInfo, + EncDecMultiModalProcessor, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from .clip import CLIPMLP from .interfaces import SupportsMultiModal @@ -63,8 +67,6 @@ from .utils import maybe_prefix logger = init_logger(__name__) -MLLAMA_IMAGE_TOKEN_ID = 128256 -MLLAMA_IMAGE_TOKEN = "<|image|>" class MllamaImagePixelInputs(TypedDict): @@ -81,158 +83,191 @@ class MllamaImagePixelInputs(TypedDict): # TODO: support LlamaImageEmbeddingInputs -def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: - num_images = 0 - for token_id in prompt_token_ids[::-1]: - if token_id == MLLAMA_IMAGE_TOKEN_ID: - num_images += 1 - elif num_images > 0: - break - return num_images - - -def input_processor_for_mllama( - ctx: InputContext, - inputs: EncoderDecoderInputs, -) -> EncoderDecoderInputs: - # Example input to processor: - # { - # 'encoder': { - # 'type': 'token', - # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 - # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 - # 'multi_modal_data': {'image': }, # noqa: E501 - # }, - # 'decoder': { - # 'type': 'token', - # 'prompt_token_ids': [128000], - # }, - # } - - # move encoder prompt to decoder - dec_inputs = TokenInputs(**inputs["encoder"]) - - multi_modal_data = dec_inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - # text-only - return EncoderDecoderInputs( - encoder=token_inputs([]), - decoder=dec_inputs, +def calc_token_per_chunk(image_size: int) -> int: + assert image_size % 14 == 0, "chunk size should be multiple of 14" + token_per_chunk = (image_size // 14)**2 + 1 + return token_per_chunk + + +class MllamaProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self) -> MllamaConfig: + return self.ctx.get_hf_config(MllamaConfig) + + def get_hf_processor(self) -> MllamaProcessor: + return self.ctx.get_hf_processor(MllamaProcessor) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_token_per_chunk_from_config(self) -> int: + image_size = self.get_hf_config().vision_config.image_size + return calc_token_per_chunk(image_size) + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + vision_config = self.get_hf_config().vision_config + token_per_chunk = self.get_token_per_chunk_from_config() + mm_max_tokens = vision_config.max_num_tiles * token_per_chunk + return {"image": mm_max_tokens} + + def 
get_num_tiles_per_image(self, image_height: int, + image_width: int) -> int: + vision_config = self.get_hf_config().vision_config + max_num_tiles = vision_config.max_num_tiles + image_size = vision_config.image_size + tiled_height, tiled_width = get_optimal_tiled_canvas( + image_height, + image_width, + max_num_tiles, + tile_size=image_size, + ) + num_tiles_height = tiled_height // image_size + num_tiles_width = tiled_width // image_size + return num_tiles_height * num_tiles_width + + def get_image_size_with_most_features(self) -> ImageSize: + vision_config = self.get_hf_config().vision_config + image_size = vision_config.image_size + max_num_tiles = vision_config.max_num_tiles + # Result in the max possible feature size (h:w = 16:1) + return ImageSize(height=max_num_tiles * image_size, width=image_size) + + +class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + hf_processor = self.info.get_hf_processor() + image_token: str = hf_processor.image_token + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, ) - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - image_data = [image_data] - - assert is_list_of(image_data, Image.Image) - - num_image_tokens = dec_inputs['prompt_token_ids'].count( - MLLAMA_IMAGE_TOKEN_ID) - if num_image_tokens != len(image_data): - raise ValueError( - f"The number of image tokens ({num_image_tokens}) must be" - f" the same as the number of images ({len(image_data)})") - - # Since only the last group of consecutive images - # are attended by the decoded tokens, we only need to - # get the number of tiles for those images. 
- num_decode_images = _get_num_image_in_last_group( - dec_inputs["prompt_token_ids"]) - - hf_config = ctx.model_config.hf_config - vision_config = hf_config.vision_config - - num_tiles = 0 - for image in image_data[::-1]: - width, height = image.size - tile_size = vision_config.image_size - canvas_height, canvas_width = get_optimal_tiled_canvas( - image_height=height, - image_width=width, - max_image_tiles=vision_config.max_num_tiles, - tile_size=tile_size, + +class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] + ): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + tokenizer = self.info.get_tokenizer() + if mm_data: + num_tiles = [ + self.info.get_num_tiles_per_image(img.height, img.width) + for img in mm_data["images"] + ] + processed_outputs = super()._call_hf_processor( + prompt, mm_data, mm_kwargs) + processed_outputs["num_tiles"] = torch.tensor(num_tiles) + for k in ('pixel_values', 'aspect_ratio_ids', "aspect_ratio_mask"): + processed_outputs[k] = processed_outputs[k].squeeze(0) + # Example input to encoder and decoder: + # { + # 'encoder': { + # 'type': 'token', + # 'prompt_token_ids': [128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 + # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # 'decoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000], + # }, + # } + processed_token_ids = processed_outputs.pop("input_ids") + start_idx, end_idx = 0, processed_token_ids.size(1) + processed_prompt_text = tokenizer.decode(processed_token_ids[0]) + + hf_processor = self.info.get_hf_processor() + bos_token = hf_processor.bos_token + # Remove the bos_token from the start of prompt, + # because we all know there would be image_token. + if processed_prompt_text.startswith(bos_token): + start_idx += 1 + # Remove the bos_token from the end of prompt, + # because text is empty in this case. + if processed_prompt_text.endswith(bos_token): + end_idx -= 1 + processed_outputs[ + "input_ids"] = processed_token_ids[:, start_idx:end_idx] + else: + processed_outputs = tokenizer(prompt, + add_special_tokens=False, + return_tensors="pt") + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + aspect_ratio_ids=MultiModalFieldConfig.batched("image"), + aspect_ratio_mask=MultiModalFieldConfig.batched("image"), + num_tiles=MultiModalFieldConfig.batched("image"), ) - num_tiles_height = canvas_height // tile_size - num_tiles_width = canvas_width // tile_size - num_tiles += num_tiles_height * num_tiles_width - num_decode_images -= 1 - if num_decode_images == 0: - break - - # Set encoder prompt length based on the number of tiles. - # This tells the block manager to allocate correct number - # of slots for encoder tokens. 
- assert vision_config.image_size % 14 == 0, \ - "chunk size should be multiple of 14" - token_per_chunk = (vision_config.image_size // 14)**2 + 1 - num_tokens = num_tiles * token_per_chunk - - # Example output from processor: - # { - # 'encoder': { - # 'type': 'token', - # 'prompt_token_ids': [128256, 128256, ..., 128256], - # 'prompt': '<|image|><|image|>...<|image|>', - # 'multi_modal_data': {'image': }, # noqa: E501 - # }, - # 'decoder': { - # 'type': 'token', - # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 - # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 - # 'multi_modal_data': {'image': }, # noqa: E501 - # }, - # } - return EncoderDecoderInputs( - encoder=token_inputs( - prompt_token_ids=[MLLAMA_IMAGE_TOKEN_ID] * num_tokens, - prompt=MLLAMA_IMAGE_TOKEN * num_tokens, - multi_modal_data=multi_modal_data, - ), - decoder=dec_inputs, - ) - - -def get_max_mllama_image_tokens(ctx: InputContext) -> int: - hf_config = ctx.model_config.hf_config - token_per_chunk = (hf_config.vision_config.image_size // 14)**2 + 1 - return hf_config.vision_config.max_num_tiles * token_per_chunk - - -def dummy_decoder_seq_data(seq_len: int, num_images: int): - # <|image|> * num_images + 0 * (seq_len - num_images) - assert seq_len >= num_images, \ - "seq_len should be greater than or equal to num_images" - - return SequenceData.from_prompt_token_counts( - (MLLAMA_IMAGE_TOKEN_ID, num_images), - (0, seq_len - num_images), - ) - - -def dummy_encoder_seq_data(ctx: InputContext, num_images: int): - num_tokens = get_max_mllama_image_tokens(ctx) * num_images - - return SequenceData.from_prompt_token_counts( - (MLLAMA_IMAGE_TOKEN_ID, num_tokens)) - - -def dummy_image(num_images: int, ): - width = height = 1024 - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_decoder_data_for_mllama(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - return DummyData(dummy_decoder_seq_data(seq_len, num_images)) - - -def dummy_encoder_data_for_mllama(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - return DummyData(dummy_encoder_seq_data(ctx, num_images), - dummy_image(num_images)) + + def create_encoder_prompt( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + ) -> Union[str, list[int]]: + data = mm_data.get("image", []) + num_images = 1 if isinstance(data, Image) else len(data) + image_token_id = self.info.get_hf_config().image_token_index + return [image_token_id] * num_images + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + token_per_chunk = self.info.get_token_per_chunk_from_config() + image_token_id = self.info.get_hf_config().image_token_index + + def get_replacement_mllama(item_idx): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + num_tile = self.info.get_num_tiles_per_image( + image_height=image_size.height, + image_width=image_size.width, + ) + num_tokens = num_tile * token_per_chunk + return [image_token_id] * num_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement_mllama, + ) + ] def _prepare_aspect_ratio_attention_mask( @@ -1107,11 +1142,9 @@ def 
forward( return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_mllama_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_decoder_data_for_mllama) -@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_mllama) -@INPUT_REGISTRY.register_input_processor(input_processor_for_mllama) +@MULTIMODAL_REGISTRY.register_processor(MllamaMultiModalProcessor, + info=MllamaProcessingInfo, + dummy_inputs=MllamaDummyInputsBuilder) class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], @@ -1120,7 +1153,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config: MllamaConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.quant_config = quant_config self.vocab_size = config.text_config.vocab_size @@ -1130,6 +1163,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.pad_token_id = \ config.pad_token_id if config.pad_token_id is not None else -1 self.image_size = config.vision_config.image_size + self.image_token_id = config.image_token_index self.vision_model = MllamaVisionModel(config.vision_config, quant_config, @@ -1204,48 +1238,12 @@ def _parse_and_validate_image_input(self, **kwargs: object): if pixel_values is not None: assert aspect_ratio_ids is not None assert aspect_ratio_mask is not None - max_num_images = max([len(x[0]) for x in pixel_values]) - if max_num_images == 0: - raise ValueError("No images provided.") - max_num_tiles = max( - max([len(x) for x in y[0]]) for y in pixel_values) - device = next(self.multi_modal_projector.parameters()).device - bsz = len(pixel_values) - out_num_tiles = [] - out_images = torch.zeros( - bsz, - max_num_images, - max_num_tiles, - 3, - self.image_size, - self.image_size, - dtype=torch.float32, - device=device, - ) - out_ar_ids = torch.ones(bsz, - max_num_images, - dtype=torch.int64, - device=device) - out_ar_mask = torch.zeros(bsz, - max_num_images, - max_num_tiles, - dtype=torch.int64, - device=device) - for b in range(len(pixel_values)): - _num_tiles = [] - for i in range(len(pixel_values[b][0])): - img = pixel_values[b][0][i] - out_images[b, i, :img.shape[0]] = img - out_ar_ids[b, i] = aspect_ratio_ids[b][0][i] - out_ar_mask[b, i] = aspect_ratio_mask[b][0][i] - _num_tiles.append(img.shape[0]) - out_num_tiles.append(_num_tiles) return MllamaImagePixelInputs( type="pixel_values", - data=out_images, - aspect_ratio_ids=out_ar_ids, - aspect_ratio_mask=out_ar_mask, + data=pixel_values, + aspect_ratio_ids=aspect_ratio_ids, + aspect_ratio_mask=aspect_ratio_mask, ) if image_embeds is not None: @@ -1312,7 +1310,7 @@ def get_cross_attention_mask( batch_token_ids.append(token_ids[start:start + seq_len]) start += seq_len sparse_mask = [ - get_cross_attention_token_mask(t, MLLAMA_IMAGE_TOKEN_ID) + get_cross_attention_token_mask(t, self.image_token_id) for t in batch_token_ids ] @@ -1384,8 +1382,8 @@ def forward( # block manager to allocate blocks for those images only. # See input_processor_for_mllama() for more details. 
num_tiles_tensor = kwargs.pop("num_tiles") - num_tiles = [t[0].tolist() for t in num_tiles_tensor] - num_tokens_per_tile = (self.image_size // 14)**2 + 1 + num_tiles = [t.tolist() for t in num_tiles_tensor] + num_tokens_per_tile = calc_token_per_chunk(self.image_size) actual_encoder_seq_lens = [ sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles ] diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 5f9593ee8b2..25ca8d1e71f 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -739,3 +739,19 @@ class MultiModalInputs(TypedDict): For each modality, information about the placeholder tokens in :code:`prompt_token_ids`. """ + + +class MultiModalEncDecInputs(MultiModalInputs): + """ + Represents the outputs of :class:`vllm.multimodal.EncDecMultiModalProcessor` + ready to be passed to vLLM internals. + """ + + encoder_prompt: str + """The processed encoder prompt text.""" + + encoder_prompt_token_ids: list[int] + """The processed token IDs of the encoder prompt.""" + + encoder_token_type_ids: NotRequired[list[int]] + """The token type IDs of the encoder prompt.""" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index d704fa59b96..74479f5ffad 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -20,9 +20,9 @@ from vllm.utils import LRUCache, flatten_2d_lists, full_groupby from .hasher import MultiModalHasher -from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem, - PlaceholderRange) +from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, + MultiModalFieldConfig, MultiModalInputs, MultiModalKwargs, + MultiModalKwargsItem, PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser if TYPE_CHECKING: @@ -1293,3 +1293,57 @@ def apply( mm_hashes=mm_hashes, mm_placeholders=mm_placeholder_ranges, ) + + +class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]): + + @abstractmethod + def create_encoder_prompt( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + ) -> Union[str, list[int]]: + """Create input prompt for the encoder.""" + raise NotImplementedError + + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalEncDecInputs: + """ + Process multi-modal inputs to be used in vLLM. + The main processing steps are modified to fit encoder-decoder model: + 1. Create encoder prompt from input prompt text. + 2. Apply the HF processor on encoder prompt. + 3. Copy the input prompt text as decoder prompt inputs. 
+ """ + encoder_prompt = self.create_encoder_prompt(prompt, mm_data) + encoder_inputs = super().apply( + encoder_prompt, + mm_data, + hf_processor_mm_kwargs, + ) + + # We assumed the decoder prompt text is copied from + # the original encoder prompt without extra process + tokenizer = self.info.get_tokenizer() + if isinstance(prompt, str): + decoder_prompt = prompt + decoder_prompt_ids = encode_tokens(tokenizer, + prompt, + add_special_tokens=False) + else: + decoder_prompt = decode_tokens(tokenizer, prompt) + decoder_prompt_ids = prompt + + mm_inputs = MultiModalEncDecInputs( + encoder_prompt=encoder_inputs["prompt"], + encoder_prompt_token_ids=encoder_inputs["prompt_token_ids"], + **encoder_inputs) + mm_inputs.update({ + "prompt": decoder_prompt, + "prompt_token_ids": decoder_prompt_ids + }) + return mm_inputs diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 5dd75485404..81c92b38f8e 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -144,7 +144,11 @@ def _get_dummy_mm_inputs( hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, ) - def get_dummy_data(self, seq_len: int) -> DummyData: + def get_dummy_data( + self, + seq_len: int, + is_encoder_data: bool = False, + ) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData @@ -183,16 +187,18 @@ def get_dummy_data(self, seq_len: int) -> DummyData: total_len = len(prompt_token_ids) # V0 does not support chunked prefill. - if total_len > seq_len and not envs.VLLM_USE_V1: - logger.warning( - "The context length (%d) of the model is too short " - "to hold the multi-modal embeddings in the worst case " - "(%d tokens in total, out of which %s are reserved for " - "multi-modal embeddings). This may cause certain multi-modal " - "inputs to fail during inference, even when the input text is " - "short. To avoid this, you should increase `max_model_len`, " - "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, - total_len, total_placeholders_by_modality) + if (total_len > seq_len and not envs.VLLM_USE_V1) or is_encoder_data: + if total_len > seq_len: + logger.warning( + "The context length (%d) of the model is too short " + "to hold the multi-modal embeddings in the worst case " + "(%d tokens in total, out of which %s are reserved for " + "multi-modal embeddings). This may cause certain " + "multi-modal inputs to fail during inference, even when " + "the input text is short. 
To avoid this, you should " + "increase `max_model_len`, reduce `max_num_seqs`, " + "and/or reduce `mm_counts`.", seq_len, total_len, + total_placeholders_by_modality) return DummyData( seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), From bbd664cfd019ba0b70df75dae94b4aa61cf23ef9 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Wed, 12 Feb 2025 21:52:41 -0800 Subject: [PATCH 0140/1240] Simplify logic of locating CUDART so file path (#13203) Signed-off-by: Lu Fang Signed-off-by: Louis Ulmer --- .../device_communicators/cuda_wrapper.py | 26 +------------------ 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index bc2cfbf3218..1d53b1c5b80 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -5,7 +5,6 @@ """ import ctypes -import glob from dataclasses import dataclass from typing import Any, Dict, List, Optional @@ -62,29 +61,6 @@ def find_loaded_library(lib_name) -> Optional[str]: return path -def get_cudart_lib_path_from_env() -> Optional[str]: - """ - In some system, find_loaded_library() may not work. So we allow users to - specify the path through environment variable VLLM_CUDART_SO_PATH. - """ - cudart_so_env = envs.VLLM_CUDART_SO_PATH - if cudart_so_env is not None: - cudart_paths = [ - cudart_so_env, - ] - for path in cudart_paths: - file_paths = glob.glob(path) - if len(file_paths) > 0: - logger.info( - "Found cudart library at %s through env var" - "VLLM_CUDART_SO_PATH=%s", - file_paths[0], - cudart_so_env, - ) - return file_paths[0] - return None - - class CudaRTLibrary: exported_functions = [ # ​cudaError_t cudaSetDevice ( int device ) @@ -131,7 +107,7 @@ def __init__(self, so_file: Optional[str] = None): if so_file is None: so_file = find_loaded_library("libcudart") if so_file is None: - so_file = get_cudart_lib_path_from_env() + so_file = envs.VLLM_CUDART_SO_PATH # fallback to env var assert so_file is not None, \ ( "libcudart is not loaded in the current process, " From e8c1301bd2dd5abccd4e06c87452d95e794f1ba7 Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Wed, 12 Feb 2025 23:10:28 -0800 Subject: [PATCH 0141/1240] [Build] Automatically use the wheel of the base commit with Python-only build (#13178) Signed-off-by: Louis Ulmer --- .../installation/gpu/cuda.inc.md | 16 ++++++++--- setup.py | 27 ++++++++++++++++--- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 5c2ea30dbfd..948bdbffbeb 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -89,12 +89,22 @@ cd vllm VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the [latest nightly wheel](https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl) and use the compiled libraries from there in the installation. +This command will do the following: +1. Look for the current branch in your vLLM clone. +2. Identify the corresponding base commit in the main branch. +3. Download the pre-built wheel of the base commit. +4. Use its compiled libraries in the installation. -The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. 
For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): +:::{note} +1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol. +2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date. +::: + +In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable. ```console -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl pip install --editable . ``` diff --git a/setup.py b/setup.py index 27e5aab760f..5a74a44c0a6 100755 --- a/setup.py +++ b/setup.py @@ -268,15 +268,34 @@ def run(self): class repackage_wheel(build_ext): """Extracts libraries and other files from an existing wheel.""" - default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - def run(self) -> None: - wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", - self.default_wheel) + def get_base_commit_in_main_branch(self) -> str: + import subprocess + + try: + current_branch = subprocess.check_output( + ["git", "branch", "--show-current"]).decode("utf-8").strip() + + base_commit = subprocess.check_output( + ["git", "merge-base", "main", + current_branch]).decode("utf-8").strip() + return base_commit + except Exception as err: + logger.warning( + "Failed to get the base commit in the main branch. " + "Using the nightly wheel. 
The libraries in this " + "wheel may not be compatible with your dev branch: %s", err) + return "nightly" + def run(self) -> None: assert _is_cuda( ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None) + if wheel_location is None: + base_commit = self.get_base_commit_in_main_branch() + wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + import zipfile if os.path.isfile(wheel_location): From db0bd6b12e3a045d0e5b7b3315f70132981d0061 Mon Sep 17 00:00:00 2001 From: LikeSundayLikeRain Date: Thu, 13 Feb 2025 02:11:26 -0500 Subject: [PATCH 0142/1240] [Bugfix] deepseek_r1_reasoning_parser put reason content in wrong field in certain edge case (#13097) Signed-off-by: Louis Ulmer --- .../test_deepseekr1_reasoning_parser.py | 10 +++++----- .../reasoning_parsers/deepseek_r1_reasoning_parser.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py index fdadb2e21ff..ea504f3d0b4 100644 --- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py +++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py @@ -24,10 +24,10 @@ "reasoning_content": "This is a reasoning section", "content": None, } -NO_REASONING = { +NO_CONTENT = { "output": "This is content", - "reasoning_content": None, - "content": "This is content", + "reasoning_content": "This is content", + "content": None, } NO_REASONING_STREAMING = { "output": "This is a reasoning section", @@ -98,8 +98,8 @@ ), pytest.param( False, - NO_REASONING, - id="no_reasoning_token", + NO_CONTENT, + id="no_content_token", ), pytest.param( True, diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py index 33bba04882b..e5ab6e6b233 100644 --- a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py +++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py @@ -128,7 +128,7 @@ def extract_reasoning_content( # Thus we assume the reasoning content is always at the start. # Ref https://huggingface.co/deepseek-ai/DeepSeek-R1/commit/8a58a132790c9935686eb97f042afa8013451c9f if self.think_end_token not in model_output: - return None, model_output + return model_output, None else: # Add a start token if it's missing to keep compatibility. 
if self.think_start_token not in model_output: From 4d2fdbbb4709ffe759caeb81a5c561503bb1d021 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 13 Feb 2025 02:12:21 -0500 Subject: [PATCH 0143/1240] [Frontend] Move CLI code into vllm.cmd package (#12971) Signed-off-by: Louis Ulmer --- docs/source/design/arch_overview.md | 2 +- setup.py | 2 +- vllm/entrypoints/cli/__init__.py | 0 vllm/entrypoints/cli/main.py | 79 ++++++++++ vllm/entrypoints/cli/openai.py | 172 +++++++++++++++++++++ vllm/entrypoints/cli/serve.py | 63 ++++++++ vllm/entrypoints/cli/types.py | 24 +++ vllm/entrypoints/openai/api_server.py | 3 +- vllm/scripts.py | 208 +------------------------- 9 files changed, 348 insertions(+), 205 deletions(-) create mode 100644 vllm/entrypoints/cli/__init__.py create mode 100644 vllm/entrypoints/cli/main.py create mode 100644 vllm/entrypoints/cli/openai.py create mode 100644 vllm/entrypoints/cli/serve.py create mode 100644 vllm/entrypoints/cli/types.py diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 04886e5981e..7bed0a001d6 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -66,7 +66,7 @@ This server can be started using the `vllm serve` command. vllm serve ``` -The code for the `vllm` CLI can be found in . +The code for the `vllm` CLI can be found in . Sometimes you may see the API server entrypoint used directly instead of via the `vllm` CLI command. For example: diff --git a/setup.py b/setup.py index 5a74a44c0a6..7243a2ab30a 100755 --- a/setup.py +++ b/setup.py @@ -689,7 +689,7 @@ def _read_requirements(filename: str) -> List[str]: package_data=package_data, entry_points={ "console_scripts": [ - "vllm=vllm.scripts:main", + "vllm=vllm.entrypoints.cli.main:main", ], }, ) diff --git a/vllm/entrypoints/cli/__init__.py b/vllm/entrypoints/cli/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py new file mode 100644 index 00000000000..e94d9a0561f --- /dev/null +++ b/vllm/entrypoints/cli/main.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: Apache-2.0 + +# The CLI entrypoint to vLLM. +import os +import signal +import sys + +import vllm.entrypoints.cli.openai +import vllm.entrypoints.cli.serve +import vllm.version +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + +CMD_MODULES = [ + vllm.entrypoints.cli.openai, + vllm.entrypoints.cli.serve, +] + + +def register_signal_handlers(): + + def signal_handler(sig, frame): + sys.exit(0) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTSTP, signal_handler) + + +def env_setup(): + # The safest multiprocessing method is `spawn`, as the default `fork` method + # is not compatible with some accelerators. The default method will be + # changing in future versions of Python, so we should use it explicitly when + # possible. + # + # We only set it here in the CLI entrypoint, because changing to `spawn` + # could break some existing code using vLLM as a library. `spawn` will cause + # unexpected behavior if the code is not protected by + # `if __name__ == "__main__":`. 
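+    # Note that the check below only installs a default: exporting
+    # VLLM_WORKER_MULTIPROC_METHOD ahead of time (e.g. back to `fork`)
+    # still takes precedence for setups that need it.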
+ # + # References: + # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods + # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing + # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors + # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders + if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ: + logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def main(): + env_setup() + + parser = FlexibleArgumentParser(description="vLLM CLI") + parser.add_argument('-v', + '--version', + action='version', + version=vllm.version.__version__) + subparsers = parser.add_subparsers(required=False, dest="subparser") + cmds = {} + for cmd_module in CMD_MODULES: + new_cmds = cmd_module.cmd_init() + for cmd in new_cmds: + cmd.subparser_init(subparsers).set_defaults( + dispatch_function=cmd.cmd) + cmds[cmd.name] = cmd + args = parser.parse_args() + if args.subparser in cmds: + cmds[args.subparser].validate(args) + + if hasattr(args, "dispatch_function"): + args.dispatch_function(args) + else: + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py new file mode 100644 index 00000000000..73df900f610 --- /dev/null +++ b/vllm/entrypoints/cli/openai.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# Commands that act as an interactive OpenAI API client + +import argparse +import os +import signal +import sys +from typing import List, Optional, Tuple + +from openai import OpenAI +from openai.types.chat import ChatCompletionMessageParam + +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.utils import FlexibleArgumentParser + + +def _register_signal_handlers(): + + def signal_handler(sig, frame): + sys.exit(0) + + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTSTP, signal_handler) + + +def _interactive_cli(args: argparse.Namespace) -> Tuple[str, OpenAI]: + _register_signal_handlers() + + base_url = args.url + api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY") + openai_client = OpenAI(api_key=api_key, base_url=base_url) + + if args.model_name: + model_name = args.model_name + else: + available_models = openai_client.models.list() + model_name = available_models.data[0].id + + print(f"Using model: {model_name}") + + return model_name, openai_client + + +def chat(system_prompt: Optional[str], model_name: str, + client: OpenAI) -> None: + conversation: List[ChatCompletionMessageParam] = [] + if system_prompt is not None: + conversation.append({"role": "system", "content": system_prompt}) + + print("Please enter a message for the chat model:") + while True: + try: + input_message = input("> ") + except EOFError: + return + conversation.append({"role": "user", "content": input_message}) + + chat_completion = client.chat.completions.create(model=model_name, + messages=conversation) + + response_message = chat_completion.choices[0].message + output = response_message.content + + conversation.append(response_message) # type: ignore + print(output) + + +def _add_query_options( + parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + parser.add_argument( + "--url", + type=str, + default="http://localhost:8000/v1", + help="url of the running OpenAI-Compatible RESTful API server") + 
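+    # --model-name and --api-key are optional: when omitted, the first model
+    # returned by the server's list-models API and the OPENAI_API_KEY
+    # environment variable are used instead (see _interactive_cli above).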
parser.add_argument( + "--model-name", + type=str, + default=None, + help=("The model name used in prompt completion, default to " + "the first model in list models API call.")) + parser.add_argument( + "--api-key", + type=str, + default=None, + help=( + "API key for OpenAI services. If provided, this api key " + "will overwrite the api key obtained through environment variables." + )) + return parser + + +class ChatCommand(CLISubcommand): + """The `chat` subcommand for the vLLM CLI. """ + + def __init__(self): + self.name = "chat" + super().__init__() + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + model_name, client = _interactive_cli(args) + system_prompt = args.system_prompt + conversation: List[ChatCompletionMessageParam] = [] + if system_prompt is not None: + conversation.append({"role": "system", "content": system_prompt}) + + print("Please enter a message for the chat model:") + while True: + try: + input_message = input("> ") + except EOFError: + return + conversation.append({"role": "user", "content": input_message}) + + chat_completion = client.chat.completions.create( + model=model_name, messages=conversation) + + response_message = chat_completion.choices[0].message + output = response_message.content + + conversation.append(response_message) # type: ignore + print(output) + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + chat_parser = subparsers.add_parser( + "chat", + help="Generate chat completions via the running API server", + usage="vllm chat [options]") + _add_query_options(chat_parser) + chat_parser.add_argument( + "--system-prompt", + type=str, + default=None, + help=("The system prompt to be added to the chat template, " + "used for models that support system prompts.")) + return chat_parser + + +class CompleteCommand(CLISubcommand): + """The `complete` subcommand for the vLLM CLI. """ + + def __init__(self): + self.name = "complete" + super().__init__() + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + model_name, client = _interactive_cli(args) + print("Please enter prompt to complete:") + while True: + input_prompt = input("> ") + completion = client.completions.create(model=model_name, + prompt=input_prompt) + output = completion.choices[0].text + print(output) + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + complete_parser = subparsers.add_parser( + "complete", + help=("Generate text completions based on the given prompt " + "via the running API server"), + usage="vllm complete [options]") + _add_query_options(complete_parser) + return complete_parser + + +def cmd_init() -> List[CLISubcommand]: + return [ChatCommand(), CompleteCommand()] diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py new file mode 100644 index 00000000000..1afead8a120 --- /dev/null +++ b/vllm/entrypoints/cli/serve.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +from typing import List + +import uvloop + +from vllm.engine.arg_utils import EngineArgs +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.openai.api_server import run_server +from vllm.entrypoints.openai.cli_args import (make_arg_parser, + validate_parsed_serve_args) +from vllm.utils import FlexibleArgumentParser + + +class ServeSubcommand(CLISubcommand): + """The `serve` subcommand for the vLLM CLI. 
""" + + def __init__(self): + self.name = "serve" + super().__init__() + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + # The default value of `--model` + if args.model != EngineArgs.model: + raise ValueError( + "With `vllm serve`, you should provide the model as a " + "positional argument instead of via the `--model` option.") + + # EngineArgs expects the model name to be passed as --model. + args.model = args.model_tag + + uvloop.run(run_server(args)) + + def validate(self, args: argparse.Namespace) -> None: + validate_parsed_serve_args(args) + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + serve_parser = subparsers.add_parser( + "serve", + help="Start the vLLM OpenAI Compatible API server", + usage="vllm serve [options]") + serve_parser.add_argument("model_tag", + type=str, + help="The model tag to serve") + serve_parser.add_argument( + "--config", + type=str, + default='', + required=False, + help="Read CLI options from a config file." + "Must be a YAML with the following options:" + "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" + ) + + return make_arg_parser(serve_parser) + + +def cmd_init() -> List[CLISubcommand]: + return [ServeSubcommand()] diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py new file mode 100644 index 00000000000..f739a68c5f4 --- /dev/null +++ b/vllm/entrypoints/cli/types.py @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse + +from vllm.utils import FlexibleArgumentParser + + +class CLISubcommand: + """Base class for CLI argument handlers.""" + + name: str + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + raise NotImplementedError("Subclasses should implement this method") + + def validate(self, args: argparse.Namespace) -> None: + # No validation by default + pass + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + raise NotImplementedError("Subclasses should implement this method") diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b8f54d6c780..127ee941497 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -901,7 +901,8 @@ def signal_handler(*_) -> None: if __name__ == "__main__": # NOTE(simon): - # This section should be in sync with vllm/scripts.py for CLI entrypoints. + # This section should be in sync with vllm/entrypoints/cli/main.py for CLI + # entrypoints. parser = FlexibleArgumentParser( description="vLLM OpenAI-Compatible RESTful API server.") parser = make_arg_parser(parser) diff --git a/vllm/scripts.py b/vllm/scripts.py index 467cab28f02..7e569d2d24f 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -1,210 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 -# The CLI entrypoint to vLLM. 
-import argparse -import os -import signal -import sys -from typing import List, Optional - -import uvloop -from openai import OpenAI -from openai.types.chat import ChatCompletionMessageParam - -import vllm.version -from vllm.engine.arg_utils import EngineArgs -from vllm.entrypoints.openai.api_server import run_server -from vllm.entrypoints.openai.cli_args import (make_arg_parser, - validate_parsed_serve_args) +from vllm.entrypoints.cli.main import main as vllm_main from vllm.logger import init_logger -from vllm.utils import FlexibleArgumentParser logger = init_logger(__name__) -def register_signal_handlers(): - - def signal_handler(sig, frame): - sys.exit(0) - - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTSTP, signal_handler) - - -def serve(args: argparse.Namespace) -> None: - # The default value of `--model` - if args.model != EngineArgs.model: - raise ValueError( - "With `vllm serve`, you should provide the model as a " - "positional argument instead of via the `--model` option.") - - # EngineArgs expects the model name to be passed as --model. - args.model = args.model_tag - - uvloop.run(run_server(args)) - - -def interactive_cli(args: argparse.Namespace) -> None: - register_signal_handlers() - - base_url = args.url - api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY") - openai_client = OpenAI(api_key=api_key, base_url=base_url) - - if args.model_name: - model_name = args.model_name - else: - available_models = openai_client.models.list() - model_name = available_models.data[0].id - - print(f"Using model: {model_name}") - - if args.command == "complete": - complete(model_name, openai_client) - elif args.command == "chat": - chat(args.system_prompt, model_name, openai_client) - - -def complete(model_name: str, client: OpenAI) -> None: - print("Please enter prompt to complete:") - while True: - input_prompt = input("> ") - - completion = client.completions.create(model=model_name, - prompt=input_prompt) - output = completion.choices[0].text - print(output) - - -def chat(system_prompt: Optional[str], model_name: str, - client: OpenAI) -> None: - conversation: List[ChatCompletionMessageParam] = [] - if system_prompt is not None: - conversation.append({"role": "system", "content": system_prompt}) - - print("Please enter a message for the chat model:") - while True: - input_message = input("> ") - conversation.append({"role": "user", "content": input_message}) - - chat_completion = client.chat.completions.create(model=model_name, - messages=conversation) - - response_message = chat_completion.choices[0].message - output = response_message.content - - conversation.append(response_message) # type: ignore - print(output) - - -def _add_query_options( - parser: FlexibleArgumentParser) -> FlexibleArgumentParser: - parser.add_argument( - "--url", - type=str, - default="http://localhost:8000/v1", - help="url of the running OpenAI-Compatible RESTful API server") - parser.add_argument( - "--model-name", - type=str, - default=None, - help=("The model name used in prompt completion, default to " - "the first model in list models API call.")) - parser.add_argument( - "--api-key", - type=str, - default=None, - help=( - "API key for OpenAI services. If provided, this api key " - "will overwrite the api key obtained through environment variables." - )) - return parser - - -def env_setup(): - # The safest multiprocessing method is `spawn`, as the default `fork` method - # is not compatible with some accelerators. 
The default method will be - # changing in future versions of Python, so we should use it explicitly when - # possible. - # - # We only set it here in the CLI entrypoint, because changing to `spawn` - # could break some existing code using vLLM as a library. `spawn` will cause - # unexpected behavior if the code is not protected by - # `if __name__ == "__main__":`. - # - # References: - # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods - # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing - # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors - # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders - if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ: - logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - +# Backwards compatibility for the move from vllm.scripts to +# vllm.entrypoints.cli.main def main(): - env_setup() - - parser = FlexibleArgumentParser(description="vLLM CLI") - parser.add_argument('-v', - '--version', - action='version', - version=vllm.version.__version__) - - subparsers = parser.add_subparsers(required=True, dest="subparser") - - serve_parser = subparsers.add_parser( - "serve", - help="Start the vLLM OpenAI Compatible API server", - usage="vllm serve [options]") - serve_parser.add_argument("model_tag", - type=str, - help="The model tag to serve") - serve_parser.add_argument( - "--config", - type=str, - default='', - required=False, - help="Read CLI options from a config file." - "Must be a YAML with the following options:" - "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" - ) - - serve_parser = make_arg_parser(serve_parser) - serve_parser.set_defaults(dispatch_function=serve) - - complete_parser = subparsers.add_parser( - "complete", - help=("Generate text completions based on the given prompt " - "via the running API server"), - usage="vllm complete [options]") - _add_query_options(complete_parser) - complete_parser.set_defaults(dispatch_function=interactive_cli, - command="complete") - - chat_parser = subparsers.add_parser( - "chat", - help="Generate chat completions via the running API server", - usage="vllm chat [options]") - _add_query_options(chat_parser) - chat_parser.add_argument( - "--system-prompt", - type=str, - default=None, - help=("The system prompt to be added to the chat template, " - "used for models that support system prompts.")) - chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat") - - args = parser.parse_args() - if args.subparser == "serve": - validate_parsed_serve_args(args) - - # One of the sub commands should be executed. - if hasattr(args, "dispatch_function"): - args.dispatch_function(args) - else: - parser.print_help() - - -if __name__ == "__main__": - main() + logger.warning("vllm.scripts.main() is deprecated. 
Please re-install " + "vllm or use vllm.entrypoints.cli.main.main() instead.") + vllm_main() From 9f647c6d35759c9dee08ce3477a450189e3d48a1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 12 Feb 2025 23:13:08 -0800 Subject: [PATCH 0144/1240] Allow Unsloth Dynamic 4bit BnB quants to work (#12974) Signed-off-by: Louis Ulmer --- .../layers/quantization/bitsandbytes.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 889eda009df..49d992d4cb0 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -133,8 +133,16 @@ def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]): components = prefix.split('.') # Check if any of the skip modules exactly matches any component - return any(module_name in components - for module_name in llm_int8_skip_modules) + substr_check = any(module_name in components + for module_name in llm_int8_skip_modules) + + # Allow certain layers to not be quantized + set_components = set(".".join(components[:i + 1]) + for i in range(len(components))) + set_llm_int8_skip_modules = set(llm_int8_skip_modules) + prefix_check = len(set_llm_int8_skip_modules & set_components) != 0 + + return substr_check or prefix_check class BitsAndBytesLinearMethod(LinearMethodBase): From d906aa5657c8917e83467c62148754df00ac21e8 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 13 Feb 2025 02:45:38 -0500 Subject: [PATCH 0145/1240] [CI/Build] Allow ruff to auto-fix some issues (#13180) Signed-off-by: Russell Bryant Signed-off-by: Louis Ulmer --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22b51afdc57..f664b4c558b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -13,7 +13,7 @@ repos: rev: v0.9.3 hooks: - id: ruff - args: [--output-format, github] + args: [--output-format, github, --fix] exclude: 'vllm/third_party/.*' - repo: https://github.com/codespell-project/codespell rev: v2.4.0 From 231d1c27bdc1df192027c812b5fa86c464955707 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Thu, 13 Feb 2025 00:02:46 -0800 Subject: [PATCH 0146/1240] [V1][core] Implement pipeline parallel on Ray (#12996) Signed-off-by: Louis Ulmer --- tests/distributed/test_pipeline_parallel.py | 51 ++++++++++++++++----- vllm/executor/ray_utils.py | 11 ++++- vllm/v1/core/kv_cache_utils.py | 41 +++++++++++------ vllm/v1/engine/core.py | 19 +++++--- vllm/v1/executor/abstract.py | 12 ++--- vllm/v1/worker/gpu_model_runner.py | 16 ++++++- vllm/v1/worker/gpu_worker.py | 5 +- 7 files changed, 110 insertions(+), 45 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5d7cb9e4089..6a54fb74ba9 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -40,10 +40,23 @@ class PPTestOptions(NamedTuple): @dataclass class PPTestSettings: parallel_setups: List[ParallelSetup] + # NOTE: the length of distributed_backends and + # vllm_major_versions should be the same, and they + # are first zipped together to iterate over all + # test settings. 
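+    # For example, detailed() below pairs
+    # distributed_backends=["mp", "ray", "ray"] with
+    # vllm_major_versions=["0", "0", "1"], so the Ray backend is exercised
+    # on both V0 and V1 while multiprocessing stays V0-only.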
distributed_backends: List[str] + # vllm major version: "0" for V0, "1" for V1 + vllm_major_versions: List[str] task: TaskOption test_options: PPTestOptions + def __post_init__(self): + if len(self.distributed_backends) != len(self.vllm_major_versions): + raise ValueError( + f"Length mismatch: distributed_backends " + f"({len(self.distributed_backends)}) != " + f"vllm_major_versions ({len(self.vllm_major_versions)})") + @staticmethod def detailed( *, @@ -79,7 +92,9 @@ def detailed( eager_mode=True, chunked_prefill=False), ], - distributed_backends=["mp", "ray"], + # only ray is supported for V1 + distributed_backends=["mp", "ray", "ray"], + vllm_major_versions=["0", "0", "1"], task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, @@ -108,6 +123,7 @@ def fast( chunked_prefill=False), ], distributed_backends=["mp"], + vllm_major_versions=["0"], task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, @@ -120,8 +136,9 @@ def iter_params(self, model_name: str): opts = self.test_options for parallel_setup in self.parallel_setups: - for distributed_backend in self.distributed_backends: - yield (model_name, parallel_setup, distributed_backend, + for backend, vllm_major_version in zip(self.distributed_backends, + self.vllm_major_versions): + yield (model_name, parallel_setup, backend, vllm_major_version, self.task, opts) @@ -244,6 +261,7 @@ def _compare_tp( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, + vllm_major_version: str, task: TaskOption, test_options: PPTestOptions, num_gpus_available: int, @@ -296,10 +314,13 @@ def _compare_tp( if hf_overrides: common_args.extend(["--hf-overrides", hf_overrides]) - if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2 - and chunked_prefill): - # Test Ray ADAG for a subset of the tests + specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill + if distributed_backend == "ray" and (vllm_major_version == "1" + or specific_case): + # For V1, test Ray ADAG for all the tests + # For V0, test Ray ADAG for a subset of the tests pp_env = { + "VLLM_USE_V1": vllm_major_version, "VLLM_USE_RAY_COMPILED_DAG": "1", "VLLM_USE_RAY_SPMD_WORKER": "1", "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1", @@ -348,8 +369,8 @@ def _compare_tp( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", "task", - "test_options"), + ("model_name", "parallel_setup", "distributed_backend", + "vllm_major_version", "task", "test_options"), [ params for model_name, settings in TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_name) @@ -361,6 +382,7 @@ def test_tp_language_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, + vllm_major_version: str, task: TaskOption, test_options: PPTestOptions, num_gpus_available, @@ -368,6 +390,7 @@ def test_tp_language_generation( _compare_tp(model_name, parallel_setup, distributed_backend, + vllm_major_version, task, test_options, num_gpus_available, @@ -375,8 +398,8 @@ def test_tp_language_generation( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", "task", - "test_options"), + ("model_name", "parallel_setup", "distributed_backend", + "vllm_major_version", "task", "test_options"), [ params for model_name, settings in EMBEDDING_MODELS.items() for params in settings.iter_params(model_name) @@ -388,6 +411,7 @@ def test_tp_language_embedding( model_name: str, parallel_setup: ParallelSetup, 
distributed_backend: str, + vllm_major_version: str, task: TaskOption, test_options: PPTestOptions, num_gpus_available, @@ -395,6 +419,7 @@ def test_tp_language_embedding( _compare_tp(model_name, parallel_setup, distributed_backend, + vllm_major_version, task, test_options, num_gpus_available, @@ -402,8 +427,8 @@ def test_tp_language_embedding( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", "task", - "test_options"), + ("model_name", "parallel_setup", "distributed_backend", + "vllm_major_version", "task", "test_options"), [ params for model_name, settings in MULTIMODAL_MODELS.items() for params in settings.iter_params(model_name) @@ -415,6 +440,7 @@ def test_tp_multimodal_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, + vllm_major_version: str, task: TaskOption, test_options: PPTestOptions, num_gpus_available, @@ -422,6 +448,7 @@ def test_tp_multimodal_generation( _compare_tp(model_name, parallel_setup, distributed_backend, + vllm_major_version, task, test_options, num_gpus_available, diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 33c0a25803c..8ad466a5572 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -35,7 +35,7 @@ class RayWorkerWrapper(WorkerWrapperBase): """Ray wrapper for vllm.worker.Worker, allowing Worker to be - lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" + lazily initialized after Ray sets CUDA_VISIBLE_DEVICES.""" def __init__(self, *args, **kwargs) -> None: super().__init__(*args, **kwargs) @@ -118,7 +118,14 @@ def execute_model( ) -> "ModelRunnerOutput": self.setup_device_if_necessary() assert self.worker is not None, "Worker is not initialized" - output = self.worker.model_runner.execute_model(scheduler_output) + if isinstance(scheduler_output, tuple): + scheduler_output, intermediate_tensors = scheduler_output + else: + scheduler_output, intermediate_tensors = scheduler_output, None + output = self.worker.model_runner.execute_model( + scheduler_output, intermediate_tensors) + if isinstance(output, IntermediateTensors): + output = scheduler_output, output return output def override_env_vars(self, vars: Dict[str, str]): diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bddb482d291..6dec87d4dd2 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -488,7 +488,8 @@ def is_kv_cache_type_uniform(kv_cache_spec: KVCacheSpec) -> bool: def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec, - available_memory: int) -> KVCacheConfig: + available_memory: int, + num_layers: int) -> KVCacheConfig: """ Generates the KV cache configuration for a model with one type of KV cache. Divide the available memory equally among all layers. @@ -497,6 +498,7 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, vllm_config: The global VllmConfig kv_cache_spec: The kv cache spec of the model available_memory: Memory available for KV cache in bytes. + num_layers: The number of layers in the model. 
Returns: The generated KVCacheConfig @@ -506,7 +508,7 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, assert len(page_sizes) == 1 page_size = page_sizes.pop() - num_blocks = int(available_memory // page_size // len(kv_cache_spec)) + num_blocks = int(available_memory // page_size // num_layers) num_blocks = max(num_blocks, 0) if vllm_config.cache_config.num_gpu_blocks_override is not None: @@ -536,25 +538,36 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, return kv_cache_config -def get_kv_cache_config(vllm_config: VllmConfig, kv_cache_spec: KVCacheSpec, - available_memory: int) -> KVCacheConfig: +def get_kv_cache_configs(vllm_config: VllmConfig, + kv_cache_specs: List[KVCacheSpec], + available_memory: int) -> List[KVCacheConfig]: """ Generates the KV cache configuration for a model TODO: support hybrid models with more than one type of KV cache. Args: vllm_config: The global VllmConfig - kv_cache_spec: The kv cache spec of the model + kv_cache_specs: The kv cache specs of the model available_memory: Memory available for KV cache in bytes. Returns: - The generated KVCacheConfig + The generated KVCacheConfigs """ - check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) - if is_kv_cache_type_uniform(kv_cache_spec): - # KV cache of all layers are the same, which is true for most models. - # Allocate the same amount of memory for each layer. - return _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, - available_memory) - else: - raise NotImplementedError + # Use the max number of layers to conservatively determine + # the number of blocks. + num_layers = max(len(kv_cache_spec) for kv_cache_spec in kv_cache_specs) + kv_cache_configs = [] + for kv_cache_spec in kv_cache_specs: + check_enough_kv_cache_memory(vllm_config, kv_cache_spec, + available_memory) + if is_kv_cache_type_uniform(kv_cache_spec): + # KV cache of all layers are the same, which is true for + # most models. Allocate the same amount of memory for + # each layer. + kv_cache_configs.append( + _get_kv_cache_config_uniform_type(vllm_config, kv_cache_spec, + available_memory, + num_layers)) + else: + raise NotImplementedError + return kv_cache_configs diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e4677681bd2..e1968035558 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -16,7 +16,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import get_exception_traceback, zmq_socket_ctx -from vllm.v1.core.kv_cache_utils import get_kv_cache_config +from vllm.v1.core.kv_cache_utils import get_kv_cache_configs from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType) @@ -73,20 +73,25 @@ def _initialize_kv_caches(self, start = time.time() # Get all kv cache needed by the model - kv_cache_spec = self.model_executor.get_kv_cache_spec() + kv_cache_specs = self.model_executor.get_kv_cache_specs() # Profiles the peak memory usage of the model to determine how much # memory can be allocated for kv cache. 
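        # determine_available_memory() already reduces to the minimum across
        # all workers (see Executor.determine_available_memory), so every rank
        # sizes its KV cache against the same memory budget.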
- availble_gpu_memory = self.model_executor.determine_available_memory() + available_gpu_memory = self.model_executor.determine_available_memory() # Get the kv cache tensor size - kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, - availble_gpu_memory) - num_gpu_blocks = kv_cache_config.num_blocks + kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, + available_gpu_memory) + num_gpu_blocks_set = set(config.num_blocks + for config in kv_cache_configs) + assert len(num_gpu_blocks_set) == 1, ( + f"num_gpu_blocks need to be the same across workers, " + f"but they are different: {num_gpu_blocks_set}") + num_gpu_blocks = num_gpu_blocks_set.pop() num_cpu_blocks = 0 # Initialize kv cache and warmup the execution - self.model_executor.initialize(kv_cache_config) + self.model_executor.initialize(kv_cache_configs) elapsed = time.time() - start logger.info(("init engine (profile, create kv cache, " diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 093be09ae11..d1ffc891ad6 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Type +from typing import List, Type from vllm.config import VllmConfig from vllm.executor.executor_base import ExecutorBase @@ -48,12 +48,12 @@ def get_class(vllm_config: VllmConfig) -> Type["Executor"]: f"{distributed_executor_backend}") return executor_class - def initialize(self, kv_cache_config: KVCacheConfig) -> None: + def initialize(self, kv_cache_configs: List[KVCacheConfig]) -> None: """ Initialize the KV caches and begin the model execution loop of the underlying workers. """ - self.collective_rpc("initialize_cache", args=(kv_cache_config, )) + self.collective_rpc("initialize_cache", args=(kv_cache_configs, )) self.collective_rpc("compile_or_warm_up_model") def determine_available_memory(self) -> int: # in bytes @@ -63,11 +63,9 @@ def determine_available_memory(self) -> int: # in bytes # operators can be applied to all workers. 
return min(output) - def get_kv_cache_spec(self) -> KVCacheSpec: + def get_kv_cache_specs(self) -> List[KVCacheSpec]: output = self.collective_rpc("get_kv_cache_spec") - for x in output: - assert x == output[0] - return output[0] + return output def execute_model( self, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9b1eab613bf..5d8da7545f0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -12,7 +12,7 @@ from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention from vllm.config import CompilationLevel, VllmConfig -from vllm.distributed.parallel_state import graph_capture +from vllm.distributed.parallel_state import get_pp_group, graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger @@ -21,6 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType +from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, LayerBlockType, cdiv, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, @@ -773,6 +774,7 @@ def get_model(self) -> nn.Module: def execute_model( self, scheduler_output: "SchedulerOutput", + intermediate_tensors: Optional[IntermediateTensors] = None, ) -> ModelRunnerOutput: batch_changed = self._update_states(scheduler_output) @@ -831,8 +833,11 @@ def execute_model( positions=positions, kv_caches=self.kv_caches, attn_metadata=None, + intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, ) + if not get_pp_group().is_last_rank: + return hidden_states hidden_states = hidden_states[:num_scheduled_tokens] sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) @@ -1007,12 +1012,19 @@ def _dummy_run( positions = self.mrope_positions[:, :num_tokens] else: positions = self.positions[:num_tokens] + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = self.model.make_empty_intermediate_tensors( + batch_size=num_tokens, + dtype=self.model_config.dtype, + device=self.device) with set_forward_context(None, self.vllm_config): hidden_states = model( input_ids=input_ids, positions=positions, kv_caches=kv_caches, attn_metadata=None, + intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, ) return hidden_states @@ -1142,6 +1154,8 @@ def profile_run(self) -> None: # Trigger compilation for general shape. hidden_states = self._dummy_run(self.max_num_tokens, dummy_kv_caches) + if not get_pp_group().is_last_rank: + return hidden_states hidden_states = hidden_states[logit_indices] logits = self.model.compute_logits(hidden_states, None) # TODO(woosuk): Consider the memory usage of the sampler. 
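As a quick illustration of the sizing logic introduced in `kv_cache_utils.py` above, the sketch below reproduces the uniform-type arithmetic; the `available_memory` and `page_size` values are invented for illustration only and are not taken from the patch.

```python
# Minimal sketch of the get_kv_cache_configs() sizing under pipeline parallelism.
available_memory = 8 * 1024**3   # bytes, the minimum reported by all workers
page_size = 2 * 1024**2          # bytes one block occupies for a single layer
kv_cache_specs = [
    {"layers.0": None, "layers.1": None},  # rank 0 owns two layers
    {"layers.2": None},                    # rank 1 owns one layer
]

# Size against the largest per-rank layer count so that every rank can
# allocate the same number of blocks (the engine asserts this later).
num_layers = max(len(spec) for spec in kv_cache_specs)
num_blocks = int(available_memory // page_size // num_layers)
print(num_blocks)  # 2048 with the numbers above
```

In the actual implementation the spec values are KVCacheSpec objects and the page size is derived from them rather than hard-coded.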
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index ad53f90b866..beedca05cd5 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -2,7 +2,7 @@ """A GPU worker class.""" import gc import os -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, List, Optional import torch import torch.distributed @@ -194,8 +194,9 @@ def determine_available_memory(self) -> int: def get_kv_cache_spec(self) -> KVCacheSpec: return self.model_runner.get_kv_cache_spec() - def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None: + def initialize_cache(self, kv_cache_configs: List[KVCacheConfig]) -> None: """Allocate GPU KV cache with the specified kv_cache_config.""" + kv_cache_config = kv_cache_configs[self.rank] if self.vllm_config.model_config.enable_sleep_mode: allocator = CuMemAllocator.get_instance() context = allocator.use_memory_pool(tag="kv_cache") From 15aedbf545b6e9bd1f865ae5a9d632be34065f90 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 13 Feb 2025 16:31:37 +0800 Subject: [PATCH 0147/1240] [VLM] Remove input processor from clip and siglip (#13165) Signed-off-by: Louis Ulmer --- vllm/model_executor/models/clip.py | 149 ++------------------------- vllm/model_executor/models/siglip.py | 74 +------------ 2 files changed, 10 insertions(+), 213 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 1e784f5b417..547f6244781 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,156 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Optional, Set, Tuple, Union -import numpy as np import torch import torch.nn as nn -from PIL import Image from transformers import CLIPVisionConfig from vllm.attention.layer import MultiHeadAttention -from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import DecoderOnlyInputs, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import SequenceData from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs -def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: - assert image_size % patch_size == 0 - return image_size // patch_size - - -def get_clip_num_patches(*, image_size: int, patch_size: int) -> int: - grid_length = get_clip_patch_grid_length(image_size=image_size, - patch_size=patch_size) - return grid_length * grid_length - - -def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int: - return get_clip_num_patches(image_size=hf_config.image_size, - patch_size=hf_config.patch_size) + 1 - - -def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int: - return get_clip_image_feature_size(hf_config) - - -def dummy_seq_data_for_clip(hf_config: CLIPVisionConfig, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] 
= None, - mm_key: str = "image"): - if image_feature_size_override is None: - image_feature_size = get_clip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - mm_key: - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_image_for_clip( - hf_config: CLIPVisionConfig, - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_video_for_clip( - hf_config: CLIPVisionConfig, - num_frames: int, - num_videos: int = 1, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - pil_frame = dummy_image_for_clip( - hf_config, - num_images=1, - image_width_override=image_width_override, - image_height_override=image_height_override) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], num_frames, axis=0) - video_data = [mm_data_per_video] * num_videos - mm_data = {"video": video_data} - return mm_data - - -def input_processor_for_clip( - model_config: ModelConfig, - hf_config: CLIPVisionConfig, - inputs: DecoderOnlyInputs, - *, - image_token_id: int, - image_feature_size_override: Optional[Union[int, List[int]]] = None, -): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. 
- return inputs - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - if image_feature_size_override is None: - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - image_feature_size = get_clip_image_feature_size(hf_config) - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - else: - image_feature_size = image_feature_size_override - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=image_token_id, - repeat_count=image_feature_size, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) - - class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]): def get_num_image_tokens( @@ -159,10 +27,10 @@ def get_num_image_tokens( image_width: int, image_height: int, ) -> int: - return get_clip_image_feature_size(self.vision_config) + return self.get_patch_grid_length()**2 + 1 def get_max_image_tokens(self) -> int: - return get_max_clip_image_tokens(self.vision_config) + return self.get_patch_grid_length()**2 + 1 def get_image_size(self) -> int: return self.vision_config.image_size @@ -171,10 +39,9 @@ def get_patch_size(self) -> int: return self.vision_config.patch_size def get_patch_grid_length(self) -> int: - return get_clip_patch_grid_length( - image_size=self.vision_config.image_size, - patch_size=self.vision_config.patch_size, - ) + image_size, patch_size = self.get_image_size(), self.get_patch_size() + assert image_size % patch_size == 0 + return image_size // patch_size # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa @@ -186,6 +53,7 @@ def __init__(self, config: CLIPVisionConfig): self.embed_dim = config.hidden_size self.image_size = config.image_size self.patch_size = config.patch_size + assert self.image_size % self.patch_size == 0 self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) @@ -197,8 +65,7 @@ def __init__(self, config: CLIPVisionConfig): bias=False, ) - self.num_patches = get_clip_num_patches(image_size=self.image_size, - patch_size=self.patch_size) + self.num_patches = (self.image_size // self.patch_size)**2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index a81462f6fbf..ddae78d7739 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -3,18 +3,15 @@ within a vision language model.""" import math -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Optional, Set, Tuple, Union -import numpy as np import torch from PIL import Image from torch import nn from transformers import SiglipVisionConfig from vllm.attention.layer import MultiHeadAttention -from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import DecoderOnlyInputs, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, @@ -23,9 +20,7 @@ from 
vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import SequenceData from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs @@ -93,71 +88,6 @@ def dummy_image_for_siglip( return {"image": image if num_images == 1 else [image] * num_images} -def dummy_video_for_siglip( - hf_config: SiglipVisionConfig, - num_frames: int, - num_videos: int = 1, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - pil_frame = dummy_image_for_siglip( - hf_config, - num_images=1, - image_width_override=image_width_override, - image_height_override=image_height_override) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], num_frames, axis=0) - video_data = [mm_data_per_video] * num_videos - mm_data = {"video": video_data} - return mm_data - - -def input_processor_for_siglip( - model_config: ModelConfig, - hf_config: SiglipVisionConfig, - inputs: DecoderOnlyInputs, - *, - image_token_id: int, - image_feature_size_override: Optional[Union[int, List[int]]] = None, -): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - if image_feature_size_override is None: - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - image_feature_size = get_siglip_image_feature_size(hf_config) - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - else: - image_feature_size = image_feature_size_override - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=image_token_id, - repeat_count=image_feature_size, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) - - class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]): def get_num_image_tokens( From 28a084c4fd3579996842231359b26f8d8d76f74e Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 13 Feb 2025 03:51:46 -0500 Subject: [PATCH 0148/1240] [Frontend] Pass pre-created socket to uvicorn (#13113) Signed-off-by: Louis Ulmer --- vllm/entrypoints/api_server.py | 1 + vllm/entrypoints/launcher.py | 9 ++++++--- vllm/entrypoints/openai/api_server.py | 13 ++++++++++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 96818507d58..00793d4b967 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -127,6 +127,7 @@ async def run_server(args: Namespace, shutdown_task = await serve_http( app, + sock=None, host=args.host, port=args.port, log_level=args.log_level, diff --git a/vllm/entrypoints/launcher.py 
b/vllm/entrypoints/launcher.py index 351a39525fa..79946a498da 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -2,8 +2,9 @@ import asyncio import signal +import socket from http import HTTPStatus -from typing import Any +from typing import Any, Optional import uvicorn from fastapi import FastAPI, Request, Response @@ -17,7 +18,8 @@ logger = init_logger(__name__) -async def serve_http(app: FastAPI, **uvicorn_kwargs: Any): +async def serve_http(app: FastAPI, sock: Optional[socket.socket], + **uvicorn_kwargs: Any): logger.info("Available routes are:") for route in app.routes: methods = getattr(route, "methods", None) @@ -34,7 +36,8 @@ async def serve_http(app: FastAPI, **uvicorn_kwargs: Any): loop = asyncio.get_running_loop() - server_task = loop.create_task(server.serve()) + server_task = loop.create_task( + server.serve(sockets=[sock] if sock else None)) def signal_handler() -> None: # prevents the uvicorn signal handler to exit early diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 127ee941497..588a7781c11 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -10,7 +10,6 @@ import re import signal import socket -import sys import tempfile import uuid from argparse import Namespace @@ -831,6 +830,7 @@ def create_server_socket(addr: Tuple[str, int]) -> socket.socket: sock = socket.socket(family=family, type=socket.SOCK_STREAM) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) sock.bind(addr) return sock @@ -878,8 +878,17 @@ def signal_handler(*_) -> None: model_config = await engine_client.get_model_config() await init_app_state(engine_client, model_config, app.state, args) + def _listen_addr(a: str) -> str: + if is_valid_ipv6_address(a): + return '[' + a + ']' + return a or "0.0.0.0" + + logger.info("Starting vLLM API server on http://%s:%d", + _listen_addr(sock_addr[0]), sock_addr[1]) + shutdown_task = await serve_http( app, + sock=sock, host=args.host, port=args.port, log_level=args.uvicorn_log_level, @@ -888,8 +897,6 @@ def signal_handler(*_) -> None: ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, - # Workaround to work on macOS - fd=sock.fileno() if sys.platform.startswith("darwin") else None, **uvicorn_kwargs, ) From 4cd0d5da2653462995673f7701256338623a9c8e Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 13 Feb 2025 03:43:24 -0800 Subject: [PATCH 0149/1240] [V1] Clarify input processing and multimodal feature caching logic (#13211) Signed-off-by: Louis Ulmer --- vllm/v1/engine/core.py | 16 +++++----- .../{mm_input_mapper.py => mm_input_cache.py} | 29 ++++++++++++------- vllm/v1/engine/processor.py | 20 +++++++++---- vllm/v1/worker/gpu_model_runner.py | 7 +++-- 4 files changed, 45 insertions(+), 27 deletions(-) rename vllm/v1/engine/{mm_input_mapper.py => mm_input_cache.py} (82%) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e1968035558..4642ac1778e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,7 +20,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType) -from vllm.v1.engine.mm_input_mapper import MMInputMapperServer +from vllm.v1.engine.mm_input_cache import MMInputCacheServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, 
RequestStatus from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder @@ -65,7 +65,7 @@ def __init__( log_stats=self.log_stats, ) - self.mm_input_mapper_server = MMInputMapperServer( + self.mm_input_cache_server = MMInputCacheServer( vllm_config.model_config) def _initialize_kv_caches(self, @@ -102,13 +102,13 @@ def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" if request.mm_hashes is not None: - # Here, if hash exists for an image, then it will be fetched - # from the cache, else it will be added to the cache. - # Note that the cache here is mirrored with the client side of the - # MM mapper, so anything that has a hash must have a HIT cache - # entry here as well. + # Here, if hash exists for a multimodal input, then it will be + # fetched from the cache, else it will be added to the cache. + # Note that the cache here is mirrored with the client cache, so + # anything that has a hash must have a HIT cache entry here + # as well. assert request.mm_inputs is not None - request.mm_inputs = self.mm_input_mapper_server.process_inputs( + request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) req = Request.from_engine_core_request(request) diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_cache.py similarity index 82% rename from vllm/v1/engine/mm_input_mapper.py rename to vllm/v1/engine/mm_input_cache.py index 83a0d9db161..e1b6679c284 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -10,12 +10,18 @@ logger = init_logger(__name__) -# The idea of MM preprocessor caching is based on having a client and a server, -# where the client executes in the frontend process (=P0) and the server in the -# core process (=P1). +# The idea of multimodal preprocessing caching is based on having a client and +# a server, where the client executes in the frontend process (=P0) and the +# server in the core process (=P1). # -# -- Client: Executes the MM mapper and performs caching of the results. -# -- Server: Performs caching of the results +# -- Client: +# - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs. +# - Perform caching of the generated MultiModalKwargs. +# - This client can be deprecated once all mutimodal models migrate to use +# merged preprocessor with built-in caching functionality. +# +# -- Server: +# - Perform caching of the received MultiModalKwargs. # # The caching for both client and server is mirrored/similar, and this allows us # to avoid the serialization of "mm_inputs" (like pixel values) between @@ -27,7 +33,9 @@ MM_CACHE_SIZE = 256 -class MMInputMapperClient: +# TODO(ywang96): Deprecate this class once all multimodal models migrate to use +# merged preprocessor with built-in caching functionality. +class MMInputCacheClient: def __init__( self, @@ -54,7 +62,8 @@ def cache_hit_ratio(self, steps): logger.debug("MMInputMapper: cache_hit_ratio = %.2f ", self.mm_cache_hits / self.mm_cache_total) - # TODO: Support modalities beyond image. + # NOTE: process_inputs only supports image inputs since all multimodal + # models with other modalities have migrated to use merged preprocessor. 
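+    # Each image either reuses a precomputed MultiModalKwargs entry (merged
+    # preprocessor path) or is run through the legacy input mapper, and hits
+    # against the mirrored LRU cache are tracked via cache_hit_ratio().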
def process_inputs( self, mm_data: MultiModalDataDict, @@ -95,7 +104,7 @@ def process_inputs( # Reuse precomputed input (for merged preprocessor) mm_input = precomputed_mm_inputs[input_id] else: - # Apply MM mapper + # Apply legacy input_mapper mm_input = self.multi_modal_input_mapper( {"image": [image_inputs[input_id]]}, mm_processor_kwargs=mm_processor_kwargs, @@ -114,13 +123,13 @@ def process_inputs( return ret_inputs -class MMInputMapperServer: +class MMInputCacheServer: def __init__(self, model_config): self.use_cache = not model_config.disable_mm_preprocessor_cache self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE) - def process_inputs( + def get_and_update( self, mm_inputs: List[Optional[MultiModalKwargs]], mm_hashes: List[str], diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 70876b03a82..b7eee5a3997 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -17,7 +17,7 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_mapper import MMInputMapperClient +from vllm.v1.engine.mm_input_cache import MMInputCacheClient class Processor: @@ -46,7 +46,7 @@ def __init__( model_config) # Multi-modal (huggingface) input mapper - self.mm_input_mapper_client = MMInputMapperClient(model_config) + self.mm_input_cache_client = MMInputCacheClient(model_config) # Multi-modal hasher (for images) self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \ @@ -106,16 +106,24 @@ def process_inputs( assert priority == 0, "vLLM V1 does not support priority at the moment." assert trace_headers is None, "vLLM V1 does not support tracing yet." - # Process inputs. + # Process inputs, which includes: + # 1. Tokenize text prompt, with LoRA request if one exists. + # 2. For multimodal models with a merged preprocessor, preprocess + # multimodal data and expand prompt token ids accordingly. + # 3. Apply prompt adapter to prompt token ids if one exists. preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) + eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + + # Process prompt and prompt token ids. + # Only applicable to multimodal models with legacy input processor. processed_inputs = self.input_processor(preprocessed_inputs) + self._validate_model_inputs(processed_inputs) - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) if is_encoder_decoder_inputs(processed_inputs): decoder_inputs = SingletonInputsAdapter( @@ -200,8 +208,8 @@ def process_inputs( key=lambda mm_input: modality_order_dict[list( mm_input.modalities)[0]]) - # Apply mm input cache update (and input mapper if necessary). - sorted_mm_inputs = self.mm_input_mapper_client.process_inputs( + # Apply mm input cache update and legacy input mapper if one exists. 
+ sorted_mm_inputs = self.mm_input_cache_client.process_inputs( mm_data=decoder_mm_data, mm_hashes=sorted_mm_hashes, mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5d8da7545f0..fa4bd81a28d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -27,7 +27,7 @@ from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget -from vllm.v1.engine.mm_input_mapper import MMInputMapperClient +from vllm.v1.engine.mm_input_cache import MMInputCacheClient from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheSpec) from vllm.v1.outputs import LogprobsTensors, ModelRunnerOutput @@ -95,9 +95,10 @@ def __init__( self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope - # NOTE: Initialized input mapper is only used for processing dummy + # NOTE: Initialized client is only used for processing dummy # multimodal data into multimodal kwargs for GPU memory profiling. - self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config) + # Only applicable to multimodal models with legacy input mapper. + self.mm_input_mapper_profiling = MMInputCacheClient(self.model_config) self.mm_input_mapper_profiling.use_cache = False encoder_compute_budget, encoder_cache_size = compute_encoder_budget( From 77487561c08926723f87b92e7e0399a2475ba52d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 13 Feb 2025 20:34:00 +0800 Subject: [PATCH 0150/1240] [VLM] Merged multi-modal processor for Molmo (#12966) Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 2 +- .../decoder_only/language/test_models.py | 2 +- .../vision_language/test_models.py | 5 +- .../vision_language/vlm_utils/model_utils.py | 98 +- .../multimodal/processing/test_common.py | 2 + tests/models/registry.py | 1 + vllm/model_executor/models/molmo.py | 1023 +++++++++++------ vllm/multimodal/inputs.py | 80 +- vllm/utils.py | 35 +- 9 files changed, 750 insertions(+), 498 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 55b3f52356c..86b74617822 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -793,7 +793,7 @@ See [this page](#generative-models) for more information on how to use generativ - * `MolmoForCausalLM` * Molmo * T + I - * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. + * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. 
* ✅︎ * ✅︎ * ✅︎ diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index c6d5244318a..71e4a9f11ab 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -27,7 +27,7 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), pytest.param( - "THUDM/chatglm3-6b", # ChatGLM (text-only) + "THUDM/chatglm3-6b", # chatglm (text-only) ), pytest.param( "meta-llama/Llama-3.2-1B-Instruct", # llama diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index b00ec6fa699..4ed61cfc9b7 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -404,11 +404,10 @@ "molmo": VLMTestInfo( models=["allenai/Molmo-7B-D-0924"], test_type=(VLMTestType.IMAGE), - prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501 + prompt_formatter=identity, max_model_len=4096, max_num_seqs=2, - image_size_factors=[(),(1.0, 1.0, 1.0)], - patch_hf_runner=model_utils.mlomo_patch_hf_runner, + patch_hf_runner=model_utils.molmo_patch_hf_runner, postprocess_inputs=model_utils.molmo_post_processor, ), # Tests for phi3v currently live in another file because of a bug in diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index ced891e1e2c..408ce9cfead 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -6,7 +6,7 @@ import re import types from pathlib import PosixPath -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import torch from PIL.Image import Image @@ -17,9 +17,7 @@ from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from .....conftest import (HfRunner, ImageAsset, PromptAudioInput, - PromptImageInput, PromptVideoInput, _ImageAssets) -from ....utils import TokensTextLogprobs +from .....conftest import HfRunner, ImageAsset, _ImageAssets from .types import RunnerOutput @@ -522,74 +520,7 @@ def _generate(self, *args, **kwargs): return hf_model -def _generate_greedy_logprobs_limit( - self, - prompts: List[str], - max_tokens: int, - num_logprobs: int, - images: Optional[PromptImageInput] = None, - audios: Optional[PromptAudioInput] = None, - videos: Optional[PromptVideoInput] = None, - **kwargs: Any, -) -> List[TokensTextLogprobs]: - all_inputs = self.get_inputs(prompts, - images=images, - videos=videos, - audios=audios) - - # Process in batches for inference. 
- if len(all_inputs): - input_ids_lst = [] - images_lst = [] - images_input_idx_lst = [] - imges_masks_lst = [] - for inputs in all_inputs: - input_ids_lst.append(inputs["input_ids"]) - images_lst.append(inputs["images"]) - images_input_idx_lst.append(inputs["image_input_idx"]) - imges_masks_lst.append(inputs["image_masks"]) - batch_inputs = {} - batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0) - batch_inputs['images'] = torch.cat(images_lst, dim=0) - batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst, - dim=0) - batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0) - - outputs = self.model.generate_from_batch( - batch=self.wrap_device(batch_inputs, - device=self.model.device.type), - generation_config=GenerationConfig( - max_new_tokens=max_tokens, - stop_strings="<|endoftext|>", - do_sample=False, - ), - tokenizer=self.tokenizer, - output_hidden_states=True, - return_dict_in_generate=True, - ) - - all_logprobs: List[List[Dict[int, float]]] = [] - all_output_ids: List[List[int]] = [] - all_output_strs: List[str] = [] - - for index in range(len(all_inputs)): - ( - seq_logprobs_lst, - output_len, - ) = self._hidden_states_to_logprobs(outputs.hidden_states, - num_logprobs) - all_logprobs.append(seq_logprobs_lst) - seq_ids = outputs.sequences[index] - output_ids = seq_ids[-output_len:] - all_output_ids.append(output_ids.tolist()) - all_output_strs.append(self.tokenizer.decode(output_ids)) - outputs = zip(all_output_ids, all_output_strs, all_logprobs) - return [(output_ids, output_str, output_logprobs) - for output_ids, output_str, output_logprobs in outputs] - - -####### Molmo-specific HuggingFace runner patchers -def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: +def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for Molmo.""" hf_processor = hf_model.processor @@ -598,10 +529,23 @@ def _processor(*args, **kwargs): hf_model.processor = _processor - setattr( # noqa: B010 - hf_model, - "generate_greedy_logprobs_limit", - types.MethodType(_generate_greedy_logprobs_limit, hf_model), - ) + def _generate(self, max_new_tokens=None, do_sample=None, **kwargs): + batch = { + k: kwargs.pop(k) + for k in ("input_ids", "images", "image_input_idx", "image_masks") + if k in kwargs + } + + return self.generate_from_batch( + batch, + generation_config=GenerationConfig( + max_new_tokens=max_new_tokens, + stop_strings="<|endoftext|>", + do_sample=do_sample, + ), + **kwargs, + ) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) return hf_model diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 67ef8b17ab8..88dcc32f44f 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -168,6 +168,8 @@ def _test_processing_correctness( "mistral-community/pixtral-12b", "openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-V-2_6", + "allenai/Molmo-7B-D-0924", + "allenai/Molmo-7B-O-0924", "nvidia/NVLM-D-72B", "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-VL-2B-Instruct", diff --git a/tests/models/registry.py b/tests/models/registry.py index 7b1db55494f..66a487ca60e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -256,6 +256,7 @@ def check_available_online( "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", + extras={"olmo": "allenai/Molmo-7B-O-0924"}, # noqa: E501 
trust_remote_code=True), "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B", trust_remote_code=True), diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index b524a14977b..feb58502231 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,18 +1,20 @@ # SPDX-License-Identifier: Apache-2.0 import math -import re -from array import array from dataclasses import dataclass -from functools import lru_cache, partial -from typing import Iterable, List, Mapping, Optional, Set, Tuple, TypedDict +from functools import cached_property, partial +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union, cast) +import numpy as np import torch +import torch.nn as nn +import torch.nn.functional as F from einops import rearrange -from PIL import Image -from torch import nn -from torch.nn import functional as F -from transformers import PretrainedConfig +from transformers import (BatchFeature, PretrainedConfig, ProcessorMixin, + TensorType) +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.attention.layer import MultiHeadAttention @@ -22,8 +24,6 @@ get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import (MulAndSilu, QuickGELU, SiluAndMul) @@ -40,15 +40,21 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) -from vllm.transformers_utils.processor import get_processor +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, + MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors +from vllm.utils import JSONTree, json_map_leaves from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) @@ -56,38 +62,39 @@ VIT_LAYERS = [-2, -9] NUM_PREFIX_TOKENS = 1 ADDITIONAL_VOCAB_SIZE = 128 -DEFAULT_IMAGE_PATCH_TOKEN_ID = 152066 -DEFAULT_IM_START_TOKEN_ID = 152067 -DEFAULT_IM_END_TOKEN_ID = 152064 -DEFAULT_IM_COL_TOKEN_ID = 152065 +IMAGE_PATCH_TOKEN = "" +IM_COL_TOKEN = "" +IM_START_TOKEN = "" +IM_END_TOKEN = "" +POOLING_SIZE = 2 class MolmoImageInputs(TypedDict): - images: torch.Tensor - """Shape: - `(batch_size, num_crops, num_patch, patch_dim)` - """ + images: Union[torch.Tensor, 
List[torch.Tensor]] + """Shape: `(batch_size, num_crops, num_patch, patch_dim)`""" - image_input_idx: torch.Tensor - """Shape: - `(batch_size, num_crops, num_patch)` - """ + image_masks: Optional[Union[torch.Tensor, List[torch.Tensor]]] + """Shape: `(batch_size, num_crops, num_patch)`""" - seq_len: torch.Tensor - """Shape: - `(batch_size, )` + feat_is_patch: Union[torch.Tensor, List[torch.Tensor]] """ + A boolean mask indicating which image features correspond + to patch tokens. - image_masks: Optional[torch.Tensor] - """Shape: - `(batch_size, num_crops, num_patch)` + Shape: `(batch_size, num_crops, num_patch)` """ - image_start_end: Tuple[int, int] - """Starting and ending index of placeholder - tokens + embed_is_patch: Union[torch.Tensor, List[torch.Tensor]] + """ + A boolean mask indicating which image embeddings correspond + to patch tokens. + + Shape: `(batch_size, num_embeds)` """ + num_crops: torch.Tensor + """Shape: `(batch_size, num_images)`""" + @dataclass class VisionBackboneConfig: @@ -335,7 +342,7 @@ def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor: def forward(self, x: torch.Tensor, - patch_num: int = None) -> List[torch.Tensor]: + patch_num: Optional[int] = None) -> List[torch.Tensor]: """ : param x: (batch_size, num_patch, n_pixels) """ @@ -465,7 +472,7 @@ def forward( return output -class LanuageModelMLP(nn.Module): +class LanguageModelMLP(nn.Module): """Molmo's LLM mlp.""" def __init__(self, @@ -559,7 +566,7 @@ def __init__( prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = LanuageModelMLP(config, quant_config=quant_config) + self.mlp = LanguageModelMLP(config, quant_config=quant_config) # LayerNorm assert config.layer_norm_type == "rms" @@ -638,8 +645,8 @@ def __init__( self.vit_layers = VIT_LAYERS self.image_num_patch = vision_config.image_num_patch self.llm_patches_per_crop = ( - (self.image_num_patch[0] + 1) // 2, - (self.image_num_patch[1] + 1) // 2, + (self.image_num_patch[0] + 1) // POOLING_SIZE, + (self.image_num_patch[1] + 1) // POOLING_SIZE, ) self.image_vit = VisionTransformer(vision_config, quant_config=quant_config) @@ -723,19 +730,19 @@ def forward( image_features = image_features.reshape( (batch_size, num_image) + self.image_num_patch + (-1, ), ) - if self.image_num_patch[0] % 2 == 1: - # Pad so we can still pool 2x2 patches + if (missing_w := self.image_num_patch[0] % POOLING_SIZE): + # Padding for image pooling (see below) image_features = F.pad( image_features, - (0, 0, 0, 1, 0, 1, 0, 0, 0, 0), + (0, 0, 0, missing_w, 0, missing_w, 0, 0, 0, 0), ) # image pooling image_features = rearrange( image_features, 'b n (h dh) (w dw) c -> (b n h w) (dh dw) c', - dh=2, - dw=2, + dh=POOLING_SIZE, + dw=POOLING_SIZE, ) query = image_features.mean(-2, keepdim=True) @@ -888,249 +895,513 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -cached_get_processor = lru_cache(get_processor) +def _lowest_multiple(x: int, k: int) -> int: + return (x // k) * k + +def get_num_patches( + num_tiles: int, + *, + crop_patches: int, + left_margin: int, + right_margin: int, + pooling_size: int, +) -> int: + if num_tiles == 1: + return _lowest_multiple(crop_patches + pooling_size - 1, pooling_size) -def get_num_patches(num_tiles: int, crop_patches: int, left_margin: int, - right_margin: int, pooling_size: int) -> int: crop_window_patches = crop_patches - (left_margin + right_margin) - if num_tiles > 1: - left_crop_window_patches = (crop_window_patches + left_margin + - pooling_size - - 1) // pooling_size * pooling_size - 
middle_crop_window_patches = (crop_window_patches + pooling_size - - 1) // pooling_size * pooling_size - right_crop_window_patches = (crop_window_patches + right_margin + - pooling_size - - 1) // pooling_size * pooling_size - return left_crop_window_patches + ( - num_tiles - - 2) * middle_crop_window_patches + right_crop_window_patches - else: - single_crop_window_patches = (crop_patches + pooling_size - - 1) // pooling_size * pooling_size - return single_crop_window_patches - - -def get_tokens(tiling_h: int, tiling_w: int, crop_patches: int, - left_margin: int, right_margin: int, pooling_size: int) -> int: - h = get_num_patches(tiling_h, crop_patches, left_margin, right_margin, - pooling_size) - w = get_num_patches(tiling_w, crop_patches, left_margin, right_margin, - pooling_size) - per_row = w // pooling_size + 1 - joint = per_row * (h // pooling_size) + 2 - image_token_length = (crop_patches + pooling_size - 1) // pooling_size - resize = (image_token_length + 1) * image_token_length + 2 - return resize + joint - - -def get_max_tokens(max_crops: int, crop_patches: int, left_margin: int, - right_margin: int, pooling_size: int) -> int: - tilings = [] - for i in range(1, max_crops + 1): - for j in range(1, max_crops + 1): - if i * j <= max_crops: - tilings.append((i, j)) - tokens = [ - get_tokens(tilings[i][0], tilings[i][1], crop_patches, left_margin, - right_margin, pooling_size) for i in range(len(tilings)) - ] - return max(tokens) - - -def get_max_molmo_image_tokens(ctx: InputContext) -> int: - processor = cached_get_processor( - ctx.model_config.model, - trust_remote_code=ctx.model_config.trust_remote_code, - revision=ctx.model_config.code_revision) - image_processor = processor.image_processor - max_llm_image_tokens = get_max_tokens( - image_processor.max_crops, - image_processor.base_image_input_size[0] // - image_processor.image_patch_size, - image_processor.overlap_margins[0], - image_processor.overlap_margins[1], - 2, + + left_num = _lowest_multiple( + crop_window_patches + left_margin + pooling_size - 1, + pooling_size, + ) + middle_num = _lowest_multiple( + crop_window_patches + pooling_size - 1, + pooling_size, + ) + right_num = _lowest_multiple( + crop_window_patches + right_margin + pooling_size - 1, + pooling_size, ) - return max_llm_image_tokens + return left_num + (num_tiles - 2) * middle_num + right_num + + +def get_patches_grid_size( + *, + tiling_h: int, + tiling_w: int, + crop_patches: int, + left_margin: int, + right_margin: int, + pooling_size: int, +) -> tuple[int, int]: + nrows = get_num_patches( + tiling_h, + crop_patches=crop_patches, + left_margin=left_margin, + right_margin=right_margin, + pooling_size=pooling_size, + ) + ncols = get_num_patches( + tiling_w, + crop_patches=crop_patches, + left_margin=left_margin, + right_margin=right_margin, + pooling_size=pooling_size, + ) -# NOTE: preprocessing for the image data has been included in the -# 'input_processor_for_molmo' function -def image_input_mapper_for_molmo( - ctx: InputContext, - data: object, -): - if isinstance(data, list): - assert len(data) == 1, "Molmo supports only one image per prompt." 
- data = data[0] - - return MultiModalKwargs(data) - - -def dummy_data_for_molmo(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - processor = cached_get_processor( - ctx.model_config.model, - trust_remote_code=ctx.model_config.trust_remote_code, - revision=ctx.model_config.code_revision) - image_processor = processor.image_processor - - base_image_input_d = image_processor.image_patch_size - left_margin, right_margin = image_processor.overlap_margins - max_crops = image_processor.max_crops - - # Assume: prompt_token_ids always starts with bos_token_id followed image tokens # noqa: E501 - max_llm_image_tokens = get_max_molmo_image_tokens(ctx) - if seq_len - max_llm_image_tokens - 1 < 0: - raise RuntimeError( - f"Molmo cannot process {max_crops} crops in a prompt, " - "please increase max_model_len or reduce number of crops") - - # The vertical image has the maximum number of image tokens due to column tokens. # noqa: E501 - tiling = (max_crops, 1) - total_margin_pixels = base_image_input_d * (right_margin + left_margin) - crop_patches = image_processor.base_image_input_size[ - 0] // base_image_input_d - crop_window_patches = crop_patches - (right_margin + left_margin) - crop_window_size = crop_window_patches * base_image_input_d - - h = crop_window_size * tiling[0] + total_margin_pixels - w = crop_window_size * tiling[1] + total_margin_pixels - - dummy_image = Image.new("RGB", (w, h), color="red") - - out = processor.process("dummy prompt", dummy_image) - - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, - out["input_ids"][:1 + max_llm_image_tokens]) - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - max_llm_image_tokens - 1) - dummy_seqdata = SequenceData(token_ids) - dummy_imgdata = { - "images": out["images"], - "image_input_idx": out["image_input_idx"], - } - if "image_masks" in out: - dummy_imgdata["image_masks"] = out["image_masks"] - dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long) - size = 0 - offset = -1 - for i in range(len(token_ids)): - if token_ids[i] in (DEFAULT_IMAGE_PATCH_TOKEN_ID, - DEFAULT_IM_START_TOKEN_ID, DEFAULT_IM_END_TOKEN_ID, - DEFAULT_IM_COL_TOKEN_ID): - if offset < 0: - offset = i - size += 1 - dummy_imgdata["image_start_end"] = (offset, offset + size) - return DummyData(seq_data=dummy_seqdata, - multi_modal_data={"image": dummy_imgdata}, - multi_modal_placeholders={ - "image": - [PlaceholderRange(offset=offset, length=size)] - }) - - -def pad_images( - max_total_crops: int, - images: torch.Tensor, - image_input_idx: torch.Tensor, - image_masks: Optional[torch.Tensor] = None, + return nrows, ncols + + +def get_candidate_tilings(max_num: int) -> list[tuple[int, int]]: + tilings = [(i, j) for i in range(1, max_num + 1) + for j in range(1, max_num + 1) if i * j <= max_num] + return sorted(tilings, key=lambda x: x[0] * x[1]) + + +def select_tiling( + *, + height: int, + width: int, + patch_size: int, + max_num_patches: int, ): - n = max_total_crops - images.shape[0] - images = F.pad(images, (0, 0, 0, 0, 0, n), value=-1) - image_input_idx = F.pad(image_input_idx, (0, 0, 0, n), value=-1) - if image_masks is not None: - image_masks = F.pad(image_masks, (0, 0, 0, n), value=-1) - return images, image_input_idx, image_masks - - -def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): - prompt = inputs.get("prompt") - multi_modal_data = inputs.get("multi_modal_data") - image = None if multi_modal_data is None else multi_modal_data.get("image") - - model_config = ctx.model_config - processor = 
cached_get_processor( - ctx.model_config.model, - trust_remote_code=model_config.trust_remote_code, - revision=ctx.model_config.code_revision) - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - # NOTE: message formatting for raw text prompt is only applied for - # offline inference; for online serving, the prompt is always in - # instruction format and tokenized. - if prompt is not None and re.match(r"^User:[\s\S]*?(Assistant:)*$", - prompt): - out = processor.process(prompt, image, message_format="none") - elif prompt is not None: - out = processor.process(prompt, image) + tilings = get_candidate_tilings(max_num_patches) + candidate_tilings = np.array(tilings, dtype=np.int32) + candidate_resolutions = candidate_tilings * patch_size + + original_size = np.array([height, width], dtype=np.float32) + required_scale_d = candidate_resolutions.astype(np.float32) / original_size + required_scale = required_scale_d.min(axis=-1, keepdims=True) + + if (required_scale < 1).all(): + ix = required_scale.argmax() else: - out = processor.process(None, image, tokens=inputs["prompt_token_ids"]) - - # If there is no image, return directly. - if image is None: - new_prompt_token_ids = out["input_ids"].tolist() - prompt = inputs.get("prompt") - if prompt is None: - prompt = tokenizer.decode(new_prompt_token_ids) - return token_inputs( - prompt_token_ids=new_prompt_token_ids, - prompt=prompt, + ix = np.where(required_scale < 1.0, 10e9, required_scale).argmin() + + return candidate_tilings[ix] + + +class MolmoProcessorWrapper: + """ + Wraps :class:`MolmoProcessor` so that it can be called directly. + + The original definition can be found here: + https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py + """ + + def __init__(self, processor: ProcessorMixin): + super().__init__() + + self.processor = processor + + @cached_property + def vocab(self) -> dict[str, int]: + return self.processor.tokenizer.vocab # type: ignore + + @cached_property + def max_crops(self) -> int: + image_processor = self.processor.image_processor # type: ignore + + max_crops = image_processor.max_crops + assert isinstance(max_crops, int) + + return max_crops + + @cached_property + def base_image_input_size(self) -> tuple[int, int]: + image_processor = self.processor.image_processor # type: ignore + + base_image_input_size = image_processor.base_image_input_size + if isinstance(base_image_input_size, int): + return base_image_input_size, base_image_input_size + + return tuple(base_image_input_size) + + @cached_property + def image_patch_size(self) -> int: + image_processor = self.processor.image_processor # type: ignore + + image_patch_size = image_processor.image_patch_size + assert isinstance(image_patch_size, int) + + return image_patch_size + + @cached_property + def overlap_margins(self) -> tuple[int, int]: + image_processor = self.processor.image_processor # type: ignore + + left_margin, right_margin = image_processor.overlap_margins + assert isinstance(left_margin, int) + assert isinstance(right_margin, int) + + return left_margin, right_margin + + @cached_property + def image_token_length_w(self) -> int: + image_processor = self.processor.image_processor # type: ignore + + image_token_length_w = image_processor.image_token_length_w + assert isinstance(image_token_length_w, int) + + return image_token_length_w + + @cached_property + def image_token_length_h(self) -> int: + image_processor = self.processor.image_processor # type: ignore + + 
image_token_length_h = image_processor.image_token_length_h + assert isinstance(image_token_length_h, int) + + return image_token_length_h + + @property + def message_format(self) -> Optional[str]: + return "role" + + @property + def always_start_with_space(self) -> bool: + return True + + @cached_property + def image_patch_id(self) -> int: + return self.vocab[IMAGE_PATCH_TOKEN] + + @cached_property + def im_col_id(self) -> int: + return self.vocab[IM_COL_TOKEN] + + @cached_property + def im_start_id(self) -> int: + return self.vocab[IM_START_TOKEN] + + @cached_property + def im_end_id(self) -> int: + return self.vocab[IM_END_TOKEN] + + @property + def pooling_size(self) -> int: + return POOLING_SIZE + + def select_tiling( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + max_crops = self.max_crops + left_margin, right_margin = self.overlap_margins + base_image_input_size = self.base_image_input_size + base_image_input_d = self.image_patch_size + + total_margin_pixels = base_image_input_d * (right_margin + left_margin) + crop_patches = base_image_input_size[0] // base_image_input_d + crop_window_patches = crop_patches - (right_margin + left_margin) + crop_window_size = crop_window_patches * base_image_input_d + tiling_h, tiling_w = select_tiling( + height=image_height - total_margin_pixels, + width=image_width - total_margin_pixels, + patch_size=crop_window_size, + max_num_patches=max_crops, ) - image_processor = processor.image_processor - max_total_crops = 1 + image_processor.max_crops - images, image_input_idx, image_masks = pad_images( - max_total_crops, - out["images"], - out["image_input_idx"], - out.get("image_masks"), - ) - image_data = dict( - images=images, - image_input_idx=image_input_idx, - ) - if image_masks is not None: - image_data["image_masks"] = image_masks - - new_prompt_token_ids = out["input_ids"].tolist() - image_data["seq_len"] = torch.tensor(len(new_prompt_token_ids), - dtype=torch.long) - - multi_modal_data = dict(image=image_data) - size = 0 - offset = -1 - for i in range(len(new_prompt_token_ids)): - if new_prompt_token_ids[i] in (DEFAULT_IMAGE_PATCH_TOKEN_ID, - DEFAULT_IM_START_TOKEN_ID, - DEFAULT_IM_END_TOKEN_ID, - DEFAULT_IM_COL_TOKEN_ID): - if offset < 0: - offset = i - size += 1 - image_data["image_start_end"] = (offset, offset + size) - prompt = inputs.get("prompt") - if prompt is None: - prompt = tokenizer.decode(new_prompt_token_ids) - return token_inputs( - prompt_token_ids=new_prompt_token_ids, - prompt=prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={ - "image": [PlaceholderRange(offset=offset, length=size)] - }, - ) + return tiling_w, tiling_h + + def get_patches_grid_size( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + left_margin, right_margin = self.overlap_margins + base_image_input_size = self.base_image_input_size + base_image_input_d = self.image_patch_size + pooling_size = self.pooling_size + + crop_patches = base_image_input_size[0] // base_image_input_d + tiling_w, tiling_h = self.select_tiling( + image_height=image_height, + image_width=image_width, + ) + + nrows, ncols = get_patches_grid_size( + tiling_h=tiling_h, + tiling_w=tiling_w, + crop_patches=crop_patches, + left_margin=left_margin, + right_margin=right_margin, + pooling_size=pooling_size, + ) + + return ncols, nrows + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: 
Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + outputs = self.processor.process( # type: ignore + text, images, **kwargs) + + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + input_ids: torch.Tensor = outputs.pop("input_ids") + outputs["input_ids"] = input_ids.unsqueeze(0) + + image_input_idx = outputs.pop("image_input_idx", None) + if image_input_idx is not None: + input_is_patch = input_ids == self.image_patch_id + image_input_idx_flat: torch.Tensor = image_input_idx.view(-1) + image_valid_flat = image_input_idx_flat >= 0 + feat_is_patch_flat = image_valid_flat.clone() + feat_is_patch_flat[image_valid_flat] = ( + input_is_patch[image_input_idx_flat[image_valid_flat]]) + feat_is_patch = feat_is_patch_flat.view(*image_input_idx.shape) + + input_is_embed = torch.isin( + input_ids, + torch.tensor([ + self.image_patch_id, + self.im_col_id, + self.im_start_id, + self.im_end_id, + ]), + ) + embed_ids = input_ids[input_is_embed] + embed_is_patch = embed_ids == self.image_patch_id + assert embed_is_patch.sum() == feat_is_patch.sum() + tilings = [ + self.select_tiling( + image_width=image.size[0], + image_height=image.size[1], + ) for image in images + ] + # For each image: tiling_h * tiling_w + extra + num_crops = torch.tensor(tilings).prod(-1) + 1 + assert num_crops.sum() == len(feat_is_patch) -@MULTIMODAL_REGISTRY.register_image_input_mapper(image_input_mapper_for_molmo) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_molmo_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_molmo) -@INPUT_REGISTRY.register_input_processor(input_processor_for_molmo) + outputs["feat_is_patch"] = feat_is_patch + outputs["embed_is_patch"] = embed_is_patch + outputs["num_crops"] = num_crops + outputs["img_patch_id"] = self.image_patch_id + + return BatchFeature(outputs, tensor_type=return_tensors) + + +class MolmoProcessingInfo(BaseProcessingInfo): + + def get_hf_processor(self) -> MolmoProcessorWrapper: + processor = self.ctx.get_hf_processor() + return MolmoProcessorWrapper(processor) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_max_image_tokens()} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + processor: Optional[MolmoProcessorWrapper], + ) -> int: + if processor is None: + processor = self.get_hf_processor() + + ncols, nrows = processor.get_patches_grid_size( + image_width=image_width, + image_height=image_height, + ) + pooling_size = processor.pooling_size + + base_image_input_size = processor.base_image_input_size + base_image_input_d = processor.image_patch_size + + crop_patches = base_image_input_size[0] // base_image_input_d + + per_row = ncols // pooling_size + 1 + joint = per_row * (nrows // pooling_size) + 2 + image_token_length = (crop_patches + pooling_size - 1) // pooling_size + resize = (image_token_length + 1) * image_token_length + 2 + + return resize + joint + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + processor=None, + ) + + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + + tilings = get_candidate_tilings(processor.max_crops) + base_h, base_w = 
processor.base_image_input_size + + largest_feature_size, largest_feature_pinpoint = 0, None + for wr, hr in tilings: + width, height = base_w * wr, base_h * hr + + feat_size = self.get_num_image_tokens( + image_width=width, + image_height=height, + processor=processor, + ) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + + +class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = \ + self.info.get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + +class MolmoMultiModalProcessor(BaseMultiModalProcessor[MolmoProcessingInfo]): + + def _apply_hf_processor_tokens_only( + self, + prompt_tokens: list[int], + ) -> list[int]: + processor = self.info.get_hf_processor() + + # Apply the chat template to the tokens + tokens = processor.processor.get_tokens_input( # type: ignore + self.info.get_tokenizer().decode(prompt_tokens), + message_format=processor.message_format, + always_start_with_space=processor.always_start_with_space, + ) + + processed_data = self.info.ctx.call_hf_processor( + processor, # type: ignore + dict(tokens=tokens), + ) + prompt_ids, = processed_data.pop("input_ids").tolist() + + return prompt_ids + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + num_crops = hf_inputs.get("num_crops", torch.empty(0)) + num_images = len(num_crops) + + return dict( + images=MultiModalFieldConfig.flat_from_sizes("image", num_crops), + image_masks=MultiModalFieldConfig.flat_from_sizes( + "image", num_crops), + feat_is_patch=MultiModalFieldConfig.flat_from_sizes( + "image", num_crops), + embed_is_patch=MultiModalFieldConfig.shared("image", num_images), + num_crops=MultiModalFieldConfig.batched("image"), + img_patch_id=MultiModalFieldConfig.shared("image", num_images), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + + image_token_length_w = processor.image_token_length_w + image_token_length_h = processor.image_token_length_h + pooling_size = processor.pooling_size + + user_str = "User:" + if processor.always_start_with_space: + user_str = " " + user_str + + user_tokens = tokenizer.encode(user_str, add_special_tokens=False) + + img_patch_id = processor.image_patch_id + img_col_id = processor.im_col_id + img_start_id = processor.im_start_id + img_end_id = processor.im_end_id + + extra_row = [img_patch_id] * image_token_length_w + [img_col_id] + extra_joint = ([img_start_id] + extra_row * image_token_length_h + + [img_end_id]) + + def get_replacement_molmo(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = 
processor.get_patches_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + + joint_row = ([img_patch_id] * ((ncols + 1) // pooling_size) + + [img_col_id]) + joint = ([img_start_id] + joint_row * + ((nrows + 1) // pooling_size) + [img_end_id]) + + image_tokens = extra_joint + joint + + return PromptReplacementDetails( + full=image_tokens + user_tokens, + features=image_tokens, + ) + + return [ + PromptReplacement( + modality="image", + target=user_str, + replacement=get_replacement_molmo, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor(MolmoMultiModalProcessor, + info=MolmoProcessingInfo, + dummy_inputs=MolmoDummyInputsBuilder) class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): hf_to_vllm_mapper = WeightsMapper( @@ -1202,6 +1473,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config) self.model = MolmoModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.img_patch_id = None if self.config.weight_tying: self.lm_head = self.model.transformer.wte @@ -1224,85 +1496,143 @@ def _parse_and_validate_image_input( **kwargs: object, ) -> Optional[MolmoImageInputs]: images = kwargs.pop("images", None) - image_masks = kwargs.pop("image_masks", None) - image_start_end = kwargs.pop("image_start_end", None) if images is None: return None - image_input_idx = kwargs.pop("image_input_idx", None) - seq_len = kwargs.pop("seq_len", None) - if image_input_idx is None: - raise ValueError("image_input_idx is required for Molmo model.") - if seq_len is None: - raise ValueError("seq_len is required for Molmo model.") - if not isinstance(seq_len, torch.Tensor): - seq_len = torch.tensor(seq_len) + if not isinstance(images, (torch.Tensor, list)): + raise ValueError("Incorrect type of images. " + f"Got type: {type(images)}") + + image_masks = kwargs.pop("image_masks", None) + if not (image_masks is None or isinstance(image_masks, + (torch.Tensor, list))): + raise ValueError("Incorrect type of image_masks. " + f"Got type: {type(image_masks)}") + + feat_is_patch = kwargs.pop("feat_is_patch", None) + if not isinstance(feat_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of feat_is_patch. " + f"Got type: {type(feat_is_patch)}") + + embed_is_patch = kwargs.pop("embed_is_patch", None) + if not isinstance(embed_is_patch, (torch.Tensor, list)): + raise ValueError("Incorrect type of embed_is_patch. " + f"Got type: {type(embed_is_patch)}") + + num_crops = kwargs.pop("num_crops", None) + if not isinstance(num_crops, torch.Tensor): + raise ValueError("Incorrect type of num_crops. " + f"Got type: {type(num_crops)}") + + img_patch_id = kwargs.pop("img_patch_id", None) + if not isinstance(img_patch_id, torch.Tensor): + raise ValueError("Incorrect type of num_crops. 
" + f"Got type: {type(num_crops)}") + self.img_patch_id = img_patch_id.flatten().unique().item() return MolmoImageInputs( images=images, - image_input_idx=image_input_idx, - seq_len=seq_len, image_masks=image_masks, - image_start_end=image_start_end, + feat_is_patch=feat_is_patch, + embed_is_patch=embed_is_patch, + num_crops=num_crops, ) def _process_image_input( self, image_input: MolmoImageInputs, - ) -> torch.Tensor: - - image_features = self.vision_backbone( - images=image_input["images"], - image_masks=image_input["image_masks"], - ) + ) -> Union[torch.Tensor, List[torch.Tensor]]: + if isinstance(image_input["images"], list): + # Call the vision backbone on the whole batch at once + images_flat = flatten_bn(image_input["images"], concat=True) + image_masks_flat = (None if (image_masks := + image_input["image_masks"]) is None + else flatten_bn(image_masks, concat=True)) + + image_features_flat = self.vision_backbone( + images=images_flat.unsqueeze(0), + image_masks=(None if image_masks_flat is None else + image_masks_flat.unsqueeze(0)), + ).squeeze(0) + + # Reconstruct the batch dimension + image_features = image_features_flat.split( + image_input["num_crops"].sum(-1).tolist()) + else: + image_features = self.vision_backbone( + images=image_input["images"], + image_masks=image_input["image_masks"], + ) return image_features + def _get_mm_embeds( + self, + features: torch.Tensor, # Shape: (num_crop, num_patch, d) + feat_is_patch: torch.Tensor, # Shape: (num_crop, num_patch) + num_crops: torch.Tensor, # Shape: (num_images,) + embed_is_patch: torch.Tensor, # Shape: (num_embeds,) + ) -> list[torch.Tensor]: + """ + Scatter the patch features into a contiguous tensor that corresponds + to the embedding tokens defined by the multimodal processor. + + Note: + The original code only considers patch tokens as feature + tokens, but our processor considers all image-related tokens + as feature tokens because the feature tokens need to be + consecutive in `input_ids`. + + Example: + A simplified example for one item in the batch: + + .. code-block:: + + Embedding tokens (from HF processor): + [ ] + + embed_is_patch (from HF processor): + [ False True True False True True False False ] + + Encoder outputs (from model): + [ p1 p2 0 p3 p4 0 ] + + feat_is_patch (from HF processor): + [ True True False True True False ] + + The resulting embedding tensor is: + [ nan p1 p2 nan p3 p4 nan nan ] + """ + num_crops_per_image = num_crops.tolist() + feats_per_image = features.split(num_crops_per_image) + f_is_patch_per_image = feat_is_patch.split(num_crops_per_image) + + _, _, embed_dim = features.shape + (num_embeds, ) = embed_is_patch.shape + + embeds_in_batch = list[torch.Tensor]() + for feats, f_is_patch in zip(feats_per_image, f_is_patch_per_image): + embeds = feats.new_full((num_embeds, embed_dim), torch.nan) + embeds[embed_is_patch] = feats[f_is_patch] + embeds_in_batch.append(embeds) + + return embeds_in_batch + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None + image_features = self._process_image_input(image_input) - image_input_idx = image_input["image_input_idx"] - seq_len = image_input["seq_len"] - batch_size, num_image, num_patch = image_features.shape[:3] - assert image_input_idx.shape == (batch_size, num_image, num_patch) - - # insert the image feature into the embedding. 
- image_features = image_features.view(batch_size, num_image * num_patch, - -1) - image_input_idx = image_input_idx.view(batch_size, - num_image * num_patch) - - valid = image_input_idx >= 0 - image_features = image_features * valid[:, :, None].to( - image_features.dtype) - image_features = image_features.view( - batch_size * num_image * num_patch, -1).contiguous() - - image_input_idx = image_input_idx * valid.to(image_input_idx.dtype) - offset = torch.cat([seq_len.new_zeros(1), - seq_len.cumsum(dim=0)[:-1]], - dim=0)[:, None] - image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) - image_input_idx = image_input_idx.flatten()[:, None] - mat = image_input_idx == torch.arange( - seq_len.sum().item(), device=image_features.device)[None, :] - mat = mat.to(image_features.dtype) - - # Note: In this original implementation from AI2, the final - # vision_embeddings will be always be the same length - # of input embeddings. - vision_embeddings = torch.einsum('nd,nm->md', image_features, mat) - - # Split by the sizes of the input sequences. For each full embedding, - # extract the actual vision embeddings to be merged. - vision_embeddings = list(vision_embeddings.split(seq_len.tolist())) - for i in range(len(vision_embeddings)): - start, end = image_input['image_start_end'][i] - vision_embeddings[i] = vision_embeddings[i][start:end] - - return vision_embeddings + + return [ + self._get_mm_embeds(*args) for args in zip( + image_features, + image_input["feat_is_patch"], + image_input["num_crops"], + image_input["embed_is_patch"], + ) + ] def get_input_embeddings( self, @@ -1311,11 +1641,20 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: + assert self.img_patch_id is not None + + # Extract the patch tokens scattered in _get_mm_embeds + patch_embeddings = json_map_leaves( + lambda x: x[~x.isnan()].view(-1, *x.shape[1:]), + cast(JSONTree[torch.Tensor], multimodal_embeddings), + ) + inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, [ - DEFAULT_IMAGE_PATCH_TOKEN_ID, DEFAULT_IM_START_TOKEN_ID, - DEFAULT_IM_END_TOKEN_ID, DEFAULT_IM_COL_TOKEN_ID - ]) + input_ids, + inputs_embeds, + cast(NestedTensors, patch_embeddings), + self.img_patch_id, + ) return inputs_embeds def forward( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 25ca8d1e71f..e93fa24a6e4 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -353,17 +353,17 @@ def batched(modality: str): Example: - .. code-block:: + .. code-block:: - Input: - Data: [[AAAA] - [BBBB] - [CCCC]] + Input: + Data: [[AAAA] + [BBBB] + [CCCC]] - Output: - Element 1: [AAAA] - Element 2: [BBBB] - Element 3: [CCCC] + Output: + Element 1: [AAAA] + Element 2: [BBBB] + Element 3: [CCCC] """ return MultiModalFieldConfig( field=MultiModalBatchedField(), @@ -384,18 +384,18 @@ def flat(modality: str, slices: Sequence[slice]): Example: - .. code-block:: - - Given: - slices: [slice(0, 3), slice(3, 7), slice(7, 9)] + .. code-block:: + + Given: + slices: [slice(0, 3), slice(3, 7), slice(7, 9)] - Input: - Data: [AAABBBBCC] + Input: + Data: [AAABBBBCC] - Output: - Element 1: [AAA] - Element 2: [BBBB] - Element 3: [CC] + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] """ return MultiModalFieldConfig( field=MultiModalFlatField(slices=slices), @@ -416,18 +416,18 @@ def flat_from_sizes(modality: str, size_per_item: torch.Tensor): Example: - .. 
code-block:: - - Given: - size_per_item: [3, 4, 2] + .. code-block:: + + Given: + size_per_item: [3, 4, 2] - Input: - Data: [AAABBBBCC] + Input: + Data: [AAABBBBCC] - Output: - Element 1: [AAA] - Element 2: [BBBB] - Element 3: [CC] + Output: + Element 1: [AAA] + Element 2: [BBBB] + Element 3: [CC] See also: :func:`MultiModalFieldConfig.flat` @@ -456,19 +456,19 @@ def shared(modality: str, batch_size: int): Example: - .. code-block:: - - Given: - batch_size: 4 + .. code-block:: + + Given: + batch_size: 4 - Input: - Data: [XYZ] + Input: + Data: [XYZ] - Output: - Element 1: [XYZ] - Element 2: [XYZ] - Element 3: [XYZ] - Element 4: [XYZ] + Output: + Element 1: [XYZ] + Element 2: [XYZ] + Element 3: [XYZ] + Element 4: [XYZ] """ return MultiModalFieldConfig( field=MultiModalSharedField(batch_size), diff --git a/vllm/utils.py b/vllm/utils.py index 6a41afff8f0..79981fa0953 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -33,8 +33,7 @@ from functools import cache, lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generator, Generic, Iterator, List, Literal, - NamedTuple, Optional, Tuple, Type, TypeVar, Union, - overload) + NamedTuple, Optional, Tuple, Type, TypeVar, Union) from uuid import uuid4 import cloudpickle @@ -826,38 +825,6 @@ def is_list_of( """A nested JSON structure where the leaves need not be JSON-serializable.""" -@overload -def json_map_leaves( - func: Callable[[T], U], - value: Dict[str, JSONTree[T]], -) -> Dict[str, JSONTree[U]]: - ... - - -@overload -def json_map_leaves( - func: Callable[[T], U], - value: List[JSONTree[T]], -) -> List[JSONTree[U]]: - ... - - -@overload -def json_map_leaves( - func: Callable[[T], U], - value: Tuple[JSONTree[T], ...], -) -> Tuple[JSONTree[U], ...]: - ... - - -@overload -def json_map_leaves( - func: Callable[[T], U], - value: JSONTree[T], -) -> JSONTree[U]: - ... - - def json_map_leaves(func: Callable[[T], U], value: JSONTree[T]) -> JSONTree[U]: if isinstance(value, dict): return {k: json_map_leaves(func, v) for k, v in value.items()} From d2f9db72e6ba23f789317a9627aec2ffc799de9f Mon Sep 17 00:00:00 2001 From: Aoyu Date: Thu, 13 Feb 2025 20:35:18 +0800 Subject: [PATCH 0151/1240] [V1][Core] Add worker_base for v1 worker (#12816) Signed-off-by: Aoyu Signed-off-by: youkaichao Co-authored-by: Aoyu Co-authored-by: youkaichao Signed-off-by: Louis Ulmer --- vllm/utils.py | 43 +++++++++++++++++++++ vllm/v1/worker/gpu_worker.py | 28 +++++--------- vllm/v1/worker/worker_base.py | 63 +++++++++++++++++++++++++++++++ vllm/worker/worker_base.py | 71 +++++++++++++++++++---------------- 4 files changed, 153 insertions(+), 52 deletions(-) create mode 100644 vllm/v1/worker/worker_base.py diff --git a/vllm/utils.py b/vllm/utils.py index 79981fa0953..1d7fbd4a787 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2220,3 +2220,46 @@ def import_pynvml(): """ import vllm.third_party.pynvml as pynvml return pynvml + + +def warn_for_unimplemented_methods(cls: Type[T]) -> Type[T]: + """ + A replacement for `abc.ABC`. + When we use `abc.ABC`, subclasses will fail to instantiate + if they do not implement all abstract methods. + Here, we only require `raise NotImplementedError` in the + base class, and log a warning if the method is not implemented + in the subclass. 
+ """ + + original_init = cls.__init__ + + def find_unimplemented_methods(self: object): + unimplemented_methods = [] + for attr_name in dir(self): + # bypass inner method + if attr_name.startswith('_'): + continue + + try: + attr = getattr(self, attr_name) + # get the func of callable method + if callable(attr): + attr_func = attr.__func__ + except AttributeError: + continue + src = inspect.getsource(attr_func) + if "NotImplementedError" in src: + unimplemented_methods.append(attr_name) + if unimplemented_methods: + method_names = ','.join(unimplemented_methods) + msg = (f"Methods {method_names} not implemented in {self}") + logger.warning(msg) + + @wraps(original_init) + def wrapped_init(self, *args, **kwargs) -> None: + original_init(self, *args, **kwargs) + find_unimplemented_methods(self) + + type.__setattr__(cls, '__init__', wrapped_init) + return cls diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index beedca05cd5..8f2ffe5f16f 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -21,6 +21,7 @@ from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.gpu_model_runner import GPUModelRunner +from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -28,7 +29,7 @@ from vllm.v1.core.scheduler_output import SchedulerOutput -class Worker: +class Worker(WorkerBase): def __init__( self, @@ -39,23 +40,11 @@ def __init__( is_driver_worker: bool = False, ): - # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config) - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - - self.parallel_config.rank = rank - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method + super().__init__(vllm_config=vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=is_driver_worker) if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing @@ -126,7 +115,8 @@ def init_device(self): set_random_seed(self.model_config.seed) # Construct the model runner - self.model_runner = GPUModelRunner(self.vllm_config, self.device) + self.model_runner: GPUModelRunner = GPUModelRunner( + self.vllm_config, self.device) def load_model(self) -> None: if self.vllm_config.model_config.enable_sleep_mode: diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py new file mode 100644 index 00000000000..bc7e76c38ae --- /dev/null +++ b/vllm/v1/worker/worker_base.py @@ -0,0 +1,63 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional + +import torch +import torch.nn as nn + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.v1.kv_cache_interface import KVCacheSpec +from vllm.worker.worker_base import WorkerBase as WorkerBaseV0 + +logger = init_logger(__name__) + + +class WorkerBase(WorkerBaseV0): + """ + Abstract class for v1 worker, mainly define some methods 
for v1. + For methods shared by v0 and v1, define them in v0 WorkerBase + """ + + def __init__( + self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + is_driver_worker: bool = False, + ): + """ + Initialize common worker components. + + Args: + vllm_config: Complete vLLM configuration + local_rank: Local device index + rank: Global rank in distributed setup + distributed_init_method: Distributed initialization method + is_driver_worker: Whether this worker handles driver + responsibilities + """ + # Configuration storage + super().__init__(vllm_config=vllm_config) + + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.is_driver_worker = is_driver_worker + + # Device and model state + self.device: Optional[torch.device] = None + self.model_runner: Optional[nn.Module] = None + + def get_kv_cache_spec(self) -> KVCacheSpec: + """Get specifications for KV cache implementation.""" + raise NotImplementedError + + def compile_or_warm_up_model(self) -> None: + """Prepare model for execution through compilation/warmup.""" + raise NotImplementedError + + def check_health(self) -> None: + """Basic health check (override for device-specific checks).""" + return diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 819b81fbfdb..83fcf0865ae 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -3,7 +3,7 @@ import dataclasses import os import time -from abc import ABC, abstractmethod +from abc import abstractmethod from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union import cloudpickle @@ -19,7 +19,8 @@ from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, resolve_obj_by_qualname, run_method, - update_environment_variables) + update_environment_variables, + warn_for_unimplemented_methods) from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) @@ -27,7 +28,8 @@ logger = init_logger(__name__) -class WorkerBase(ABC): +@warn_for_unimplemented_methods +class WorkerBase: """Worker interface that allows vLLM to cleanly separate implementations for different hardware. Also abstracts control plane communication, e.g., to communicate request metadata to other workers. @@ -53,35 +55,31 @@ def __init__( from vllm.platforms import current_platform self.current_platform = current_platform - @abstractmethod def init_device(self) -> None: """Initialize device state, such as loading the model or other on-device memory allocations. """ raise NotImplementedError - @abstractmethod - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available blocks for the GPU KV cache and - swappable CPU KV cache. - - The implementation may run profiling or other heuristics to determine - the size of caches. - - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks - are blocks that are "active" on the device and can be appended to. - num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be - appended to. - """ - raise NotImplementedError - - @abstractmethod def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Initialize the KV cache with the given size in blocks. 
""" raise NotImplementedError + def get_model(self) -> nn.Module: + raise NotImplementedError + + def load_model(self) -> None: + """Load model onto target device.""" + raise NotImplementedError + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> Optional[List[SamplerOutput]]: + raise NotImplementedError + def start_worker_execution_loop(self) -> None: """Execute model loop in parallel worker. @@ -94,40 +92,43 @@ def start_worker_execution_loop(self) -> None: if output is None: return None - @abstractmethod - def get_model(self) -> nn.Module: - raise NotImplementedError + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available blocks for the GPU KV cache and + swappable CPU KV cache. - @abstractmethod - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: + The implementation may run profiling or other heuristics to determine + the size of caches. + + Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + are blocks that are "active" on the device and can be appended to. + num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + appended to. + """ raise NotImplementedError - @abstractmethod def get_cache_block_size_bytes(self) -> int: """Return the size of a single cache block, in bytes. Used in speculative decoding. """ raise NotImplementedError - @abstractmethod def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError - @abstractmethod def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError - @abstractmethod def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError - @abstractmethod def list_loras(self) -> Set[int]: raise NotImplementedError + @property + def vocab_size(self) -> int: + """Get vocabulary size from model configuration.""" + return self.model_config.get_vocab_size() + class DelegateWorkerBase(WorkerBase): """ @@ -156,6 +157,10 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + def load_model(self) -> None: + """Load model onto target device.""" + self.worker.load_model() + def get_model(self) -> nn.Module: return self.worker.get_model() From 4c622b49c14fd49c57a7fefcf90d6add56d2395f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=87=83?= Date: Thu, 13 Feb 2025 22:17:57 +0800 Subject: [PATCH 0152/1240] [Misc] Qwen2.5-VL Optimization (#13155) Signed-off-by: Louis Ulmer --- vllm/model_executor/models/qwen2_5_vl.py | 61 ++++++++++-------------- vllm/model_executor/models/qwen2_vl.py | 37 ++++++++------ 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index d4c48dbdab1..6aec99b3f96 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -45,6 +45,7 @@ from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig @@ -271,8 +272,13 @@ def forward( q, k, v = (rearrange(x, "s b ... 
-> b s ...").contiguous() for x in (q, k, v)) if rotary_pos_emb is not None: - q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) - k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + use_flash_attn = self.attn_backend == _Backend.FLASH_ATTN + q = apply_rotary_pos_emb_vision(q, + rotary_pos_emb, + use_flash_attn=use_flash_attn) + k = apply_rotary_pos_emb_vision(k, + rotary_pos_emb, + use_flash_attn=use_flash_attn) if self.attn_backend == _Backend.FLASH_ATTN: # from vllm_flash_attn.flash_attn_interface import ( @@ -296,20 +302,23 @@ def forward( "(b s) ... -> b s ...", b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: - seq_length = q.size(1) - q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) - attention_mask = torch.zeros([1, seq_length, seq_length], - device=q.device, - dtype=torch.bool) + # Execute attention entry by entry for speed & less VRAM. + outputs = [] for i in range(1, len(cu_seqlens)): - attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], - cu_seqlens[i - 1]:cu_seqlens[i]] = True - output = F.scaled_dot_product_attention(q, - k, - v, - attention_mask, - dropout_p=0.0) - context_layer = rearrange(output, "b h s d -> b s h d ") + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") + for x in [q_i, k_i, v_i]) + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask @@ -327,25 +336,6 @@ def forward( return output -class Qwen2RMSNorm(nn.Module): - - def __init__(self, hidden_size, eps=1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + - self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - class Qwen2_5_VisionBlock(nn.Module): def __init__( @@ -516,8 +506,7 @@ def __init__( hidden_size=self.hidden_size, ) - # NOTE: We use torch native RMSNorm here for precision purposes. 
- norm_layer = partial(Qwen2RMSNorm, eps=norm_eps) + norm_layer = partial(RMSNorm, eps=norm_eps) head_dim = self.hidden_size // self.num_heads self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d3294a4d4a3..961f53cef13 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -226,11 +226,15 @@ def apply_rotary_emb_torch(x: torch.Tensor, def apply_rotary_pos_emb_vision(t: torch.Tensor, - freqs: torch.Tensor) -> torch.Tensor: + freqs: torch.Tensor, + use_flash_attn=False) -> torch.Tensor: t_ = t.float() cos = freqs.cos() sin = freqs.sin() - output = apply_rotary_emb_torch(t_, cos, sin).type_as(t) + apply_rotary_emb = apply_rotary_emb_torch + if use_flash_attn: + from flash_attn.layers.rotary import apply_rotary_emb + output = apply_rotary_emb(t_, cos, sin).type_as(t) return output @@ -336,20 +340,23 @@ def forward( "(b s) ... -> b s ...", b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: - seq_length = q.size(1) - q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) - attention_mask = torch.zeros([1, seq_length, seq_length], - device=q.device, - dtype=torch.bool) + # Execute attention entry by entry for speed & less VRAM. + outputs = [] for i in range(1, len(cu_seqlens)): - attention_mask[..., cu_seqlens[i - 1]:cu_seqlens[i], - cu_seqlens[i - 1]:cu_seqlens[i]] = True - output = F.scaled_dot_product_attention(q, - k, - v, - attention_mask, - dropout_p=0.0) - context_layer = rearrange(output, "b h s d -> b s h d ") + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") + for x in [q_i, k_i, v_i]) + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) elif self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalMask From f2c56f805f92abeb552ab75d44fa52ab912cbb2b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 13 Feb 2025 22:19:15 +0800 Subject: [PATCH 0153/1240] [VLM] Separate text-only and vision variants of the same model architecture (#13157) Signed-off-by: Louis Ulmer --- docs/source/models/supported_models.md | 17 +- examples/offline_inference/vision_language.py | 3 + .../vision_language_multi_image.py | 5 +- tests/distributed/test_pipeline_parallel.py | 171 ++-- .../vision_language/test_models.py | 11 +- .../vision_language/vlm_utils/core.py | 62 +- .../vision_language/vlm_utils/types.py | 10 +- tests/models/registry.py | 37 +- tests/models/test_initialization.py | 3 +- vllm/model_executor/models/chatglm.py | 420 ++------- .../models/glm4_vision_encoder.py | 312 ------- vllm/model_executor/models/glm4v.py | 662 ++++++++++++++ vllm/model_executor/models/qwen.py | 856 +----------------- vllm/model_executor/models/qwen_vl.py | 794 ++++++++++++++++ vllm/model_executor/models/registry.py | 9 +- 15 files changed, 1729 insertions(+), 1643 deletions(-) delete mode 100644 vllm/model_executor/models/glm4_vision_encoder.py create mode 100644 vllm/model_executor/models/glm4v.py create mode 100644 vllm/model_executor/models/qwen_vl.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 
86b74617822..e498efc2208 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -699,10 +699,10 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ -- * `DeepseekVLV2ForCausalLM` +- * `DeepseekVLV2ForCausalLM`^ * DeepSeek-VL2 * T + I+ - * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) + * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. * * ✅︎ * ✅︎ @@ -713,10 +713,10 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ -- * `ChatGLMModel` +- * `GLM4VForCausalLM`^ * GLM-4V * T + I - * `THUDM/glm-4v-9b` etc. + * `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. * ✅︎ * ✅︎ * ✅︎ @@ -825,7 +825,7 @@ See [this page](#generative-models) for more information on how to use generativ * * ✅︎ * ✅︎ -- * `QWenLMHeadModel` +- * `QwenVLForConditionalGeneration`^ * Qwen-VL * T + IE+ * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. @@ -862,13 +862,12 @@ See [this page](#generative-models) for more information on how to use generativ * ✅︎ ::: +^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM. +    • For example, to use DeepSeek-VL2 series models: +      `--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -:::{note} -To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -::: - :::{note} H2O-VL series models will be available in V1 once we support backends other than FlashAttention. 
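In practice the `^` footnote above means either the `--hf-overrides` JSON flag shown in that footnote or the `hf_overrides` keyword of `LLM(...)`, which is exactly what the updated offline-inference examples below pass for GLM-4V and Qwen-VL. A short illustration (the model and settings are only an example):

from vllm import LLM

llm = LLM(
    model="THUDM/glm-4v-9b",
    trust_remote_code=True,
    hf_overrides={"architectures": ["GLM4VForCausalLM"]},
)

The server-side equivalent is `--hf-overrides '{"architectures": ["GLM4VForCausalLM"]}'`; the pipeline-parallel tests later in this patch build that argument with `json.dumps(hf_overrides)`.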
::: diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 9a4183106cf..b9963669a0d 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str): max_num_seqs=2, trust_remote_code=True, enforce_eager=True, + hf_overrides={"architectures": ["GLM4VForCausalLM"]}, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) + prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ {question}<|assistant|>" @@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str): trust_remote_code=True, max_model_len=1024, max_num_seqs=2, + hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 8d2172a606f..1a5ea0c70bc 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]): ) -def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: +def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "h2oai/h2ovl-mississippi-2b" llm = LLM( @@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str, trust_remote_code=True, max_model_len=1024, max_num_seqs=2, + hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, limit_mm_per_prompt={"image": len(image_urls)}, ) placeholders = "".join(f"Picture {i}: \n" @@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, "deepseek_vl_v2": load_deepseek_vl2, - "h2ovl_chat": load_h2onvl, + "h2ovl_chat": load_h2ovl, "idefics3": load_idefics3, "internvl_chat": load_internvl, "mllama": load_mllama, diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 6a54fb74ba9..eb9cd5db9a4 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -6,6 +6,7 @@ all workers in a node other than the head node, which can cause the test to fail. 
""" +import json import os from dataclasses import dataclass from typing import List, Literal, NamedTuple, Optional @@ -15,6 +16,7 @@ from vllm.config import TaskOption from vllm.logger import init_logger +from ..models.registry import HF_EXAMPLE_MODELS from ..utils import compare_two_settings, fork_new_process_for_each_test logger = init_logger("test_pipeline_parallel") @@ -31,10 +33,7 @@ class ParallelSetup(NamedTuple): class PPTestOptions(NamedTuple): multi_node_only: bool - trust_remote_code: bool - tokenizer_mode: Optional[str] load_format: Optional[str] = None - hf_overrides: Optional[str] = None @dataclass @@ -64,10 +63,7 @@ def detailed( pp_base: int = 2, multi_node_only: bool = False, task: TaskOption = "auto", - trust_remote_code: bool = False, - tokenizer_mode: Optional[str] = None, load_format: Optional[str] = None, - hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -97,10 +93,7 @@ def detailed( vllm_major_versions=["0", "0", "1"], task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, - load_format=load_format, - hf_overrides=hf_overrides), + load_format=load_format), ) @staticmethod @@ -110,10 +103,7 @@ def fast( pp_base: int = 2, task: TaskOption = "auto", multi_node_only: bool = False, - trust_remote_code: bool = False, - tokenizer_mode: Optional[str] = None, load_format: Optional[str] = None, - hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -126,19 +116,16 @@ def fast( vllm_major_versions=["0"], task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, - load_format=load_format, - hf_overrides=hf_overrides), + load_format=load_format), ) - def iter_params(self, model_name: str): + def iter_params(self, model_id: str): opts = self.test_options for parallel_setup in self.parallel_setups: for backend, vllm_major_version in zip(self.distributed_backends, self.vllm_major_versions): - yield (model_name, parallel_setup, backend, vllm_major_version, + yield (model_id, parallel_setup, backend, vllm_major_version, self.task, opts) @@ -150,16 +137,16 @@ def iter_params(self, model_name: str): # [Decoder-only] # Uses Llama # "BAAI/AquilaChat-7B": PPTestSettings.fast(), - "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501 - "baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True), - "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 + "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501 + "baichuan-inc/Baichuan-7B": PPTestSettings.fast(), + "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(), "bigscience/bloomz-1b1": PPTestSettings.fast(), - "THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True), - "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501 - "databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8), - "Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True), + "THUDM/chatglm3-6b": PPTestSettings.fast(), + "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"), + "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"), + "Deci/DeciLM-7B-instruct": PPTestSettings.fast(), "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(), - "deepseek-ai/DeepSeek-V2-Lite-Chat": 
PPTestSettings.fast(trust_remote_code=True), # noqa: E501 + "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(), "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(), "tiiuae/falcon-7b": PPTestSettings.fast(), "google/gemma-2b": PPTestSettings.fast(), @@ -172,36 +159,36 @@ def iter_params(self, model_name: str): "ibm/PowerMoE-3b": PPTestSettings.fast(), # Uses Llama # "internlm/internlm-chat-7b": PPTestSettings.fast(), - "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True), + "internlm/internlm2-chat-7b": PPTestSettings.fast(), "inceptionai/jais-13b-chat": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), - "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True), - "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True), + "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(), + "openbmb/MiniCPM3-4B": PPTestSettings.fast(), # Uses Llama # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(), "state-spaces/mamba-130m-hf": PPTestSettings.fast(), - "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), + "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501 "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(), "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(), "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), - "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), + "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(), "adept/persimmon-8b-chat": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 - "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), + "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(), + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501 + "Qwen/Qwen-7B-Chat": PPTestSettings.fast(), "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), - "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2), + "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501 # FIXME: Cannot load tokenizer in latest transformers version. 
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf` - # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True), + # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(), # [Encoder-only] # TODO: Implement PP # "facebook/bart-base": PPTestSettings.fast(), @@ -211,7 +198,7 @@ def iter_params(self, model_name: str): # [Text-only] "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(), "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(), - "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501 + "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"), } MULTIMODAL_MODELS = { @@ -219,20 +206,20 @@ def iter_params(self, model_name: str): "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(), "facebook/chameleon-7b": PPTestSettings.fast(), "adept/fuyu-8b": PPTestSettings.fast(), - "THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True), - "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True), + "THUDM/glm-4v-9b": PPTestSettings.fast(), + "OpenGVLab/InternVL2-1B": PPTestSettings.fast(), "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(), "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(), "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(), "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(), - "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True), - "allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True), - "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501 - "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True), + "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(), + "allenai/Molmo-7B-D-0924": PPTestSettings.fast(), + "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(), + "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"), + "Qwen/Qwen-VL-Chat": PPTestSettings.fast(), "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), - "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 + "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(), # [Encoder-decoder] # TODO: Implement PP # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), @@ -258,7 +245,7 @@ def iter_params(self, model_name: str): def _compare_tp( - model_name: str, + model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, vllm_major_version: str, @@ -267,6 +254,7 @@ def _compare_tp( num_gpus_available: int, *, method: Literal["generate", "encode"], + is_multimodal: bool, ): ( tp_size, @@ -274,13 +262,32 @@ def _compare_tp( eager_mode, chunked_prefill, ) = parallel_setup - ( - multi_node_only, - trust_remote_code, - tokenizer_mode, - load_format, - hf_overrides, - ) = test_options + + multi_node_only, load_format = test_options + + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_transformers_version(on_fail="skip") + + trust_remote_code = model_info.trust_remote_code + tokenizer_mode = model_info.tokenizer_mode + hf_overrides = model_info.hf_overrides + + if load_format == "dummy": + # Avoid OOM + text_overrides = { + "num_layers": 1, + "num_hidden_layers": 1, + "num_experts": 2, + "num_experts_per_tok": 2, + "num_local_experts": 2, + } + + if is_multimodal: + hf_overrides.update({"text_config": text_overrides}) + else: + 
hf_overrides.update(text_overrides) + else: + model_info.check_available_online(on_fail="skip") if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") @@ -312,7 +319,7 @@ def _compare_tp( if load_format: common_args.extend(["--load-format", load_format]) if hf_overrides: - common_args.extend(["--hf-overrides", hf_overrides]) + common_args.extend(["--hf-overrides", json.dumps(hf_overrides)]) specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill if distributed_backend == "ray" and (vllm_major_version == "1" @@ -355,11 +362,7 @@ def _compare_tp( ] try: - compare_two_settings(model_name, - pp_args, - tp_args, - pp_env, - method=method) + compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method) except Exception: if pp_env is None: raise @@ -369,17 +372,16 @@ def _compare_tp( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "vllm_major_version", "task", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", + "task", "test_options"), [ - params for model_name, settings in TEXT_GENERATION_MODELS.items() - for params in settings.iter_params(model_name) - if model_name in TEST_MODELS + params for model_id, settings in TEXT_GENERATION_MODELS.items() + for params in settings.iter_params(model_id) if model_id in TEST_MODELS ], ) @fork_new_process_for_each_test def test_tp_language_generation( - model_name: str, + model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, vllm_major_version: str, @@ -387,28 +389,28 @@ def test_tp_language_generation( test_options: PPTestOptions, num_gpus_available, ): - _compare_tp(model_name, + _compare_tp(model_id, parallel_setup, distributed_backend, vllm_major_version, task, test_options, num_gpus_available, - method="generate") + method="generate", + is_multimodal=False) @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "vllm_major_version", "task", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", + "task", "test_options"), [ - params for model_name, settings in EMBEDDING_MODELS.items() - for params in settings.iter_params(model_name) - if model_name in TEST_MODELS + params for model_id, settings in EMBEDDING_MODELS.items() + for params in settings.iter_params(model_id) if model_id in TEST_MODELS ], ) @fork_new_process_for_each_test def test_tp_language_embedding( - model_name: str, + model_id: str, parallel_setup: ParallelSetup, distributed_backend: str, vllm_major_version: str, @@ -416,28 +418,28 @@ def test_tp_language_embedding( test_options: PPTestOptions, num_gpus_available, ): - _compare_tp(model_name, + _compare_tp(model_id, parallel_setup, distributed_backend, vllm_major_version, task, test_options, num_gpus_available, - method="encode") + method="encode", + is_multimodal=False) @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "vllm_major_version", "task", "test_options"), + ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version", + "task", "test_options"), [ - params for model_name, settings in MULTIMODAL_MODELS.items() - for params in settings.iter_params(model_name) - if model_name in TEST_MODELS + params for model_id, settings in MULTIMODAL_MODELS.items() + for params in settings.iter_params(model_id) if model_id in TEST_MODELS ], ) @fork_new_process_for_each_test def test_tp_multimodal_generation( - model_name: str, + model_id: str, parallel_setup: 
ParallelSetup, distributed_backend: str, vllm_major_version: str, @@ -445,11 +447,12 @@ def test_tp_multimodal_generation( test_options: PPTestOptions, num_gpus_available, ): - _compare_tp(model_name, + _compare_tp(model_id, parallel_setup, distributed_backend, vllm_major_version, task, test_options, num_gpus_available, - method="generate") + method="generate", + is_multimodal=True) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 4ed61cfc9b7..2c66edb539d 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -155,10 +155,7 @@ auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], - marks=[pytest.mark.skipif( - TRANSFORMERS_VERSION < "4.49.0", - reason="HF model requires transformers>=4.49.0", - ), pytest.mark.core_model, pytest.mark.cpu_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), #### Extended model tests "aria": VLMTestInfo( @@ -215,7 +212,6 @@ "cherry_blossom": "\nPlease infer the season with reason in details.", # noqa: E501 }), multi_image_prompt="image_1:\nimage_2:\nWhich image can we see the car and the tower?", # noqa: E501 - vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501 patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, postprocess_inputs=model_utils.cast_dtype_post_processor("images"), hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, @@ -240,7 +236,7 @@ num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), - "glm4": VLMTestInfo( + "glm4v": VLMTestInfo( models=["THUDM/glm-4v-9b"], test_type=VLMTestType.IMAGE, prompt_formatter=identity, @@ -351,7 +347,6 @@ postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), - vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501 get_stop_token_ids=lambda tok: [128009], auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, @@ -437,7 +432,7 @@ auto_cls=AutoModelForVision2Seq, marks=[large_gpu_mark(min_gb=48)], ), - "qwen": VLMTestInfo( + "qwen_vl": VLMTestInfo( models=["Qwen/Qwen-VL"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=identity, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 0aed267692a..f2260f56737 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -4,12 +4,14 @@ import torch from PIL.Image import Image -from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase +from transformers import BatchEncoding from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config import TaskOption +from vllm.transformers_utils.tokenizer import AnyTokenizer from .....conftest import HfRunner, VllmRunner +from ....registry import HF_EXAMPLE_MODELS from .types import RunnerOutput @@ -31,10 +33,8 @@ def run_test( use_tokenizer_eos: bool, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], - get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], - List[int]]], + get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]], stop_str: 
Optional[List[str]], - tokenizer_mode: str, limit_mm_per_prompt: Dict[str, int], vllm_runner_kwargs: Optional[Dict[str, Any]], hf_model_kwargs: Optional[Dict[str, Any]], @@ -48,7 +48,10 @@ def run_test( """Modality agnostic test test executor for comparing HF/vLLM outputs.""" # In the case of embeddings, vLLM takes separate input tensors vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") vllm_outputs_per_mm = [] hf_outputs_per_mm = [] @@ -57,17 +60,19 @@ def run_test( # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). - vllm_kwargs: Dict[str, Any] = {} - if get_stop_token_ids is not None: - vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) - if stop_str: - vllm_kwargs["stop"] = stop_str - if vllm_runner_kwargs is None: - vllm_runner_kwargs = {} + vllm_runner_kwargs_: Dict[str, Any] = {} + if model_info.tokenizer: + vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer + if model_info.tokenizer_mode: + vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode + if model_info.hf_overrides: + vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides + + if vllm_runner_kwargs: + vllm_runner_kwargs_.update(vllm_runner_kwargs) with vllm_runner(model, - tokenizer_mode=tokenizer_mode, max_model_len=max_model_len, max_num_seqs=max_num_seqs, dtype=dtype, @@ -76,7 +81,15 @@ def run_test( distributed_executor_backend=distributed_executor_backend, enforce_eager=enforce_eager, task=task, - **vllm_runner_kwargs) as vllm_model: + **vllm_runner_kwargs_) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + + vllm_kwargs: Dict[str, Any] = {} + if get_stop_token_ids is not None: + vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) + if stop_str: + vllm_kwargs["stop"] = stop_str + for prompts, media in vllm_inputs: vllm_kwargs[runner_mm_key] = media vllm_output = vllm_model.generate_greedy_logprobs( @@ -93,16 +106,19 @@ def run_test( if patch_hf_runner is not None: hf_model = patch_hf_runner(hf_model) - # Some models need to explicitly pass the eos_token_id off the tokenizer or - # processor for a good comparison; currently assume processor/tokenizer - # agree on the EOS, and pull it off the tokenizer if requested. - hf_kwargs = {} - if use_tokenizer_eos: - hf_kwargs["eos_token_id"] = tokenizer.eos_token_id - if stop_str: - hf_kwargs["stop_strings"] = stop_str - with hf_model, torch.no_grad(): + tokenizer = hf_model.tokenizer + + # Some models need to explicitly pass the eos_token_id off the tokenizer + # or processor for a good comparison; + # currently assume processor/tokenizer agree on the EOS, and pull it off + # the tokenizer if requested. 
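The comparison helper above now derives its stop conditions from the tokenizer of the already-constructed engine rather than from a separately loaded `AutoTokenizer`, so the registry-provided `tokenizer` and `tokenizer_mode` are honored on both sides of the HF/vLLM comparison. A standalone sketch of that flow outside the test harness, assuming a generic chat model (model name, prompt, and stop string are illustrative):

from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
tokenizer = llm.get_tokenizer()            # same tokenizer the engine uses

params = SamplingParams(
    max_tokens=128,
    stop_token_ids=[tokenizer.eos_token_id],   # what get_stop_token_ids would supply
    stop=["<|endoftext|>"],                    # analogous to stop_str
)
outputs = llm.generate("USER: Describe a cat.\nASSISTANT:", params)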
+ hf_kwargs = {} + if use_tokenizer_eos: + hf_kwargs["eos_token_id"] = tokenizer.eos_token_id + if stop_str: + hf_kwargs["stop_strings"] = stop_str + for prompts, media in inputs: hf_kwargs[runner_mm_key] = media hf_output = hf_model.generate_greedy_logprobs_limit( diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index ae3b9d59bf9..ecb86609c52 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -8,12 +8,12 @@ import torch from PIL.Image import Image from pytest import MarkDecorator -from transformers import (AutoModelForCausalLM, BatchEncoding, - PreTrainedTokenizerBase) +from transformers import AutoModelForCausalLM, BatchEncoding from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config import TaskOption from vllm.sequence import SampleLogprobs +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import identity from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets @@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple): vllm_runner_kwargs: Optional[Dict[str, Any]] = None # Optional callable which gets a list of token IDs from the model tokenizer - get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], - List[int]]] = None + get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None # Optional list of strings to stop generation, useful when stop tokens are # not special tokens in the tokenizer stop_str: Optional[List[str]] = None @@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple): marks: Optional[List[MarkDecorator]] = None - tokenizer_mode: str = "auto" - def get_non_parametrized_runner_kwargs(self): """Returns a dictionary of expandable kwargs for items that are used in all test types, which are NOT used when creating the parametrized @@ -180,7 +177,6 @@ def get_non_parametrized_runner_kwargs(self): "hf_model_kwargs": self.hf_model_kwargs, "stop_str": self.stop_str, "patch_hf_runner": self.patch_hf_runner, - "tokenizer_mode": self.tokenizer_mode } diff --git a/tests/models/registry.py b/tests/models/registry.py index 66a487ca60e..9c0e6b3374d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -104,7 +104,8 @@ def check_available_online( trust_remote_code=True), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"), "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"), - # ChatGLMModel supports multimodal + "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b", + trust_remote_code=True), "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", trust_remote_code=True), "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501 @@ -138,7 +139,8 @@ def check_available_online( "InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct", trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), - "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), + "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini", + extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501 "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", is_available_online=False), @@ -167,7 +169,8 @@ def check_available_online( trust_remote_code=True), "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", 
trust_remote_code=True), - # QWenLMHeadModel supports multimodal + "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat", + trust_remote_code=True), "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", @@ -232,18 +235,19 @@ def check_available_online( "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 - "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", - extras={"text_only": "THUDM/chatglm3-6b"}, - trust_remote_code=True), - "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", - is_available_online=False), "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), - "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), + "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", + trust_remote_code=True, + hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 + "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", + extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501 "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", + extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501 trust_remote_code=True), - "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501 + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501 + {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501 "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501 "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 @@ -253,21 +257,24 @@ def check_available_online( hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), - "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6", + "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", + extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", extras={"olmo": "allenai/Molmo-7B-O-0924"}, # noqa: E501 trust_remote_code=True), "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B", trust_remote_code=True), - "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501 + "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501 + extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", trust_remote_code=True), "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 tokenizer_mode="mistral"), - "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat", - extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501 - trust_remote_code=True), + "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL", + extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501 + trust_remote_code=True, + hf_overrides={"architectures": 
["QwenVLForConditionalGeneration"]}), # noqa: E501 "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 64928a65d85..c58c6372316 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -18,8 +18,7 @@ def test_can_initialize(model_arch): # Avoid OOM def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: - if hf_config.model_type == "deepseek_vl_v2": - hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]}) + hf_config.update(model_info.hf_overrides) if hasattr(hf_config, "text_config"): text_config: PretrainedConfig = hf_config.text_config diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 153c85cfb21..26b4a95c530 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,20 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 - # Adapted from -# https://github.com/THUDM/CogAgent -"""Inference-only CogAgent model compatible with THUDM weights.""" -from argparse import Namespace -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +# https://github.com/THUDM/ChatGLM2-6B +"""Inference-only ChatGLM model compatible with THUDM weights.""" +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn from torch.nn import LayerNorm -from torchvision import transforms -from torchvision.transforms import InterpolationMode -from transformers import PreTrainedTokenizer, TensorType -from transformers.image_utils import ImageInput -from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig @@ -31,204 +23,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel -from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, BatchFeature, - MultiModalFieldConfig, - PromptReplacement) -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs import ChatGLMConfig -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) - - -class GLMImagePixelInputs(TypedDict): - pixel_values: torch.Tensor - """Shape: `(batch_size, num_channels, height, width)`""" - - -class GLM4VProcessor: - """ - This model doesn't define its own HF processor, - so we implement our own one here. 
- - """ - - def __init__( - self, - config: ChatGLMConfig, - tokenizer: PreTrainedTokenizer, - ) -> None: - super().__init__() - - self.config = config - self.tokenizer = tokenizer - - if vision_config := getattr(config, "vision_config", None): - image_size = vision_config["image_size"] - - self.image_transform = transforms.Compose([ - transforms.Resize( - (image_size, image_size), - interpolation=InterpolationMode.BICUBIC, - ), - transforms.ToTensor(), - transforms.Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ]) - else: - self.image_transform = None - - def __call__( - self, - text: Optional[Union[TextInput, list[TextInput]]] = None, - images: Optional[Union[ImageInput, list[ImageInput]]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> BatchFeature: - if text is None: - text = [] - if not isinstance(text, list): - text = [text] - if images is None: - images = [] - if not isinstance(images, list): - images = [images] - text_inputs = self.tokenizer(text) - if len(images) == 0: - image_inputs = {} - else: - if self.image_transform is None: - raise ValueError("This model does not support image inputs") - - pixel_values = [self.image_transform(image) for image in images] - image_inputs = {"pixel_values": torch.stack(pixel_values)} - - return BatchFeature( - { - **text_inputs, - **image_inputs, - }, - tensor_type=return_tensors, - ) - - -class GLM4VProcessingInfo(BaseProcessingInfo): - - def get_tokenizer(self): - tokenizer = self.ctx.tokenizer - assert isinstance(tokenizer, PreTrainedTokenizer) - return tokenizer - - def get_hf_config(self): - return self.ctx.get_hf_config(ChatGLMConfig) - - def get_hf_processor(self) -> GLM4VProcessor: - return GLM4VProcessor( - self.get_hf_config(), - self.get_tokenizer(), - ) - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} - - def get_mm_max_tokens_per_item( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> Mapping[str, int]: - return {"image": self.get_num_image_feature_tokens()} - - def get_num_image_tokens(self) -> int: - hf_config = self.get_hf_config() - if not (vision_config := getattr(hf_config, "vision_config", None)): - return 0 - - image_size = vision_config["image_size"] - patch_size = vision_config["patch_size"] - grid_length = image_size // patch_size // 2 - return grid_length * grid_length - - def get_num_image_feature_tokens(self) -> int: - # EVA2CLIPModel has embeddings for boi and eoi tokens as well - return self.get_num_image_tokens() + 2 - - -class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): - - def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.info.get_hf_config() - if not (vision_config := getattr(hf_config, "vision_config", None)): - return ProcessorInputs(prompt_text="", mm_data={}) - - target_width = target_height = vision_config["image_size"] - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>" - - return ProcessorInputs( - prompt_text=base_text * num_images, - mm_data=mm_data, - ) - - -class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, 
MultiModalFieldConfig]: - return dict(pixel_values=MultiModalFieldConfig.batched("image")) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - if not hasattr(hf_config, "vision_config"): - return [] - - boi_token_id = hf_config.boi_token_id - image_token_id = hf_config.pad_token_id - eoi_token_id = hf_config.eoi_token_id - - def get_replacement(item_idx: int): - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [image_token_id] * num_image_tokens - - return [boi_token_id] + image_tokens + [eoi_token_id] - - return [ - PromptReplacement( - modality="image", - target=[boi_token_id, image_token_id, eoi_token_id], - replacement=get_replacement, - ), - ] + maybe_prefix) class GLMAttention(nn.Module): @@ -489,7 +291,7 @@ def forward( position_ids: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, IntermediateTensors]: for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states = layer( @@ -498,8 +300,12 @@ def forward( kv_cache=kv_caches[i - self.start_layer], attn_metadata=attn_metadata, ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({"hidden_states": hidden_states}) + # Final layer norm. - if get_pp_group().is_last_rank and self.post_layer_norm: + if self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) return hidden_states @@ -534,61 +340,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config, prefix=f"{prefix}.output_layer") - vision_config_flag = getattr(config, 'vision_config', None) - if vision_config_flag is not None: - self.vision_config = Namespace(**config.vision_config) - self.vision = EVA2CLIPModel(self.config, - quant_config, - prefix=f"{prefix}.vision") - else: - self.vision = None - self.make_empty_intermediate_tensors = ( self.encoder.make_empty_intermediate_tensors) - def _parse_and_validate_image_input( - self, **kwargs: object) -> GLMImagePixelInputs: - - pixel_values = kwargs.pop("pixel_values", None) - if pixel_values is not None and self.vision is not None: - if isinstance(pixel_values, torch.Tensor): - if pixel_values.ndim > 2: - pixel_values = torch.concat(list(pixel_values)) - elif isinstance(pixel_values, list): - return torch.concat(pixel_values) - else: - raise TypeError("""pixel_values must be a torch.Tensor - or a list of torch.Tensor - """) - return GLMImagePixelInputs(pixel_values=pixel_values) - - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input["pixel_values"] is None: - return None - pixel_values = image_input["pixel_values"].to( - dtype=self.config.torch_dtype) - vision_embeddings = self.vision(pixel_values) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.embedding(input_ids) - if multimodal_embeddings is not None: - inputs_embeds = merge_multimodal_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - placeholder_token_id=[ - self.config.boi_token_id, - self.config.pad_token_id, - self.config.eoi_token_id, - ], - ) - return inputs_embeds + def get_input_embeddings(self, 
input_ids: torch.Tensor) -> torch.Tensor: + return self.embedding(input_ids) def forward( self, @@ -599,26 +355,24 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, - ) -> torch.Tensor: + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. - if intermediate_tensors is not None: - inputs_embeds = intermediate_tensors["hidden_states"] - elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) # Run encoder. hidden_states = self.encoder( - hidden_states=inputs_embeds, + hidden_states=hidden_states, position_ids=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, ) - if not get_pp_group().is_last_rank: - return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states def load_weights(self, weights: Iterable[Tuple[str, @@ -660,12 +414,18 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): +class ChatGLMBaseModel(nn.Module): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={".word_embeddings": ""}, ) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + transformer_type: type[ChatGLMModel] = ChatGLMModel, + ) -> None: super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -678,27 +438,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config self.max_position_embeddings = getattr(config, "max_sequence_length", 8192) - self.transformer = ChatGLMModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "transformer")) + self.transformer = transformer_type(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.transformer.output_layer.weight = ( self.transformer.embedding.weight) self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) self.sampler = get_sampler() - - def forward(self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - **kwargs) -> torch.Tensor: - hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - **kwargs) - return hidden_states + self.make_empty_intermediate_tensors = ( + self.transformer.make_empty_intermediate_tensors) def compute_logits( self, @@ -722,7 +472,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) -class ChatGLM(ChatGLMBaseModel): +class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP): packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"] @@ -738,82 +488,28 @@ class ChatGLM(ChatGLMBaseModel): embedding_modules = {} embedding_padding_modules = 
[] + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + if hasattr(config, "vision_config"): + hf_overrides = {"architectures": ["GLM4VForCausalLM"]} + raise RuntimeError( + "The configuration of this model indicates that it supports " + "vision inputs, but you instantiated the text-only version " + "of this model. Please use the vision model by setting " + f"`--hf-overrides {hf_overrides!r}`") -class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): - - packed_modules_mapping = { - "query_key_value": ["query_key_value"], - "dense_h_to_4h": ["dense_h_to_4h"], - "merged_proj": ["gate_proj", "dense_h_to_4h"] - } - # LoRA specific attributes - supported_lora_modules = [ - "query_key_value", - "dense", - "dense_h_to_4h", - "dense_4h_to_h", - # vision - "fc1", - "fc2", - "merged_proj", - "linear_proj" - ] - - embedding_modules = {} - embedding_padding_modules = [] - - def get_mm_mapping(self) -> MultiModelKeys: - """ - Get the module prefix in multimodal models - """ - return MultiModelKeys.from_string_field( - language_model="transformer.encoder", - connector="transformer.vision.linear_proj", - tower_model="transformer.vision.transformer") - - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - return self.transformer.get_multimodal_embeddings(**kwargs) + super().__init__(vllm_config=vllm_config, prefix=prefix) - def get_input_embeddings( + def forward( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - return self.transformer.get_input_embeddings(input_ids, - multimodal_embeddings) - - -@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor, - info=GLM4VProcessingInfo, - dummy_inputs=GLM4VDummyInputsBuilder) -class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, - SupportsMultiModal): - # Ensure that the LoRA support check passes when the class is not - # initialized, but set all these attributes to empty. 
- # These will be updated when an instance class is selected - packed_modules_mapping = {} - supported_lora_modules = [] - embedding_modules = {} - embedding_padding_modules = [] - - def __new__( - cls, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: - config = vllm_config.model_config.hf_config - - # Initialize VL - if hasattr(config, "vision_config"): # noqa: SIM108 - instance_cls = ChatGLMV - # Initialize LLM - else: - instance_cls = ChatGLM - - # quant_config references base class members, - # so update values before init is called - cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) - cls.supported_lora_modules += instance_cls.supported_lora_modules - cls.embedding_modules.update(instance_cls.embedding_modules) - cls.embedding_padding_modules += instance_cls.embedding_padding_modules - return instance_cls(vllm_config=vllm_config, prefix=prefix) + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.transformer(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py deleted file mode 100644 index 2facd1353ae..00000000000 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ /dev/null @@ -1,312 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -# Adapted from -# https://github.com/THUDM/GLM-4 -"""Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" -from argparse import Namespace -from typing import Optional - -import torch -from torch import nn -from torch.nn import LayerNorm - -from vllm.attention.layer import MultiHeadAttention -from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, - ReplicatedLinear, - RowParallelLinear) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) - - -class PatchEmbedding(nn.Module): - - def __init__(self, config): - super().__init__() - self.proj = nn.Conv2d(config.in_channels, - config.hidden_size, - kernel_size=config.patch_size, - stride=config.patch_size) - self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) - self.position_embedding = nn.Embedding(config.num_positions, - config.hidden_size) - - def forward(self, images: torch.Tensor) -> torch.Tensor: - """ - Parameters: - images : torch.Tensor - Input image tensor with shape (B, C, H, W) - - Returns: - torch.Tensor - Transformed tensor with shape (B, L, D) - """ - images = images.to(device=self.proj.weight.device, - dtype=self.proj.weight.dtype) - x = self.proj(images) - x = x.flatten(2).transpose(1, 2) - cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) - x = torch.cat((cls_token, x), dim=1) - x += self.position_embedding.weight.unsqueeze(0) - return x - - -class Attention(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.hidden_size = config.hidden_size - self.tp_size = get_tensor_model_parallel_world_size() - self.num_heads_per_rank = config.num_heads // self.tp_size - self.head_dim = 
config.hidden_size // config.num_heads - self.scale = self.head_dim**-0.5 - - self.query_key_value = QKVParallelLinear( - config.hidden_size, - self.head_dim, - config.num_heads, - quant_config=quant_config, - prefix=f"{prefix}.query_key_value", - ) - self.dense = RowParallelLinear( - config.hidden_size, - config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.dense", - ) - - self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, - self.scale) - self.output_dropout = torch.nn.Dropout(config.dropout_prob) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - qkv, _ = self.query_key_value(x) # B, L, 3 * H * D - q, k, v = qkv.chunk(3, dim=-1) - - out = self.attn(q, k, v) - output, _ = self.dense(out) - output = self.output_dropout(output) - return output - - -class MLP(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.config = config - self.activation_fn = get_act_fn(config.hidden_act) - self.fc1 = ColumnParallelLinear( - config.hidden_size, - config.intermediate_size, - quant_config=quant_config, - prefix=f"{prefix}.fc1", - ) - self.fc2 = RowParallelLinear( - config.intermediate_size, - config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.fc2", - ) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x, _ = self.fc1(x) - x = self.activation_fn(x) - x, _ = self.fc2(x) - return x - - -class TransformerLayer(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.input_layernorm = LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - self.attention = Attention(config, - quant_config=quant_config, - prefix=f"{prefix}.attention") - self.mlp = MLP(config, - quant_config=quant_config, - prefix=f"{prefix}.mlp") - self.post_attention_layernorm = LayerNorm(config.hidden_size, - eps=config.layer_norm_eps) - - def forward(self, hidden_states): - attention_input = hidden_states - attention_output = self.input_layernorm( - self.attention(attention_input)) - hidden_states = attention_input + attention_output - mlp_input = hidden_states - mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) - output = mlp_input + mlp_output - return output - - -class Transformer(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - self.layers = nn.ModuleList([ - TransformerLayer(config, - quant_config=quant_config, - prefix=f"{prefix}.layers.{layer_idx}") - for layer_idx in range(config.num_hidden_layers) - ]) - - def forward(self, hidden_states): - for layer_module in self.layers: - hidden_states = layer_module(hidden_states) - return hidden_states - - -class GLU(nn.Module): - - def __init__( - self, - config, - in_features, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - """ - The original implementation is the same as: - ```python - self.dense_h_to_4h = ColumnParallelLinear( - config.hidden_size, - config.ffn_hidden_size, - bias=False, - quant_config=quant_config - ) - - self.gate_proj = ColumnParallelLinear( - config.hidden_size, - config.ffn_hidden_size, - bias=False, - quant_config=quant_config - ) - ``` - ``` - gate_proj_output, _ = self.gate_proj(x) - dense_h_to_4h_output, _ = self.dense_h_to_4h(x) - x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) - ``` - - We merge two ColumnParallelLinear into one 
MergedColumnParallelLinear: - ``` - self.merged_proj = MergedColumnParallelLinear( - config.hidden_size, - [config.ffn_hidden_size] * 2, - bias=False, - quant_config=quant_config - ) - ``` - ``` - x, _ = self.merged_proj(x) - ``` - """ - super().__init__() - self.linear_proj = ReplicatedLinear(in_features, - config.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.linear_proj") - self.norm1 = nn.LayerNorm(config.hidden_size) - self.act1 = nn.GELU() - self.act2 = SiluAndMul() - - self.merged_proj = MergedColumnParallelLinear( - config.hidden_size, [config.ffn_hidden_size] * 2, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.merged_proj") - - self.dense_4h_to_h = RowParallelLinear( - config.ffn_hidden_size, - config.hidden_size, - bias=False, - quant_config=quant_config, - prefix=f"{prefix}.dense_4h_to_h") - - def forward(self, x): - x, _ = self.linear_proj(x) - x = self.act1(self.norm1(x)) - x, _ = self.merged_proj(x) - x = self.act2(x) - x, _ = self.dense_4h_to_h(x) - return x - - -class EVA2CLIPModel(nn.Module): - - def __init__( - self, - config, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '', - ): - super().__init__() - vision_config = Namespace(**config.vision_config) - self.patch_embedding = PatchEmbedding(vision_config) - self.transformer = Transformer(vision_config, - quant_config=quant_config, - prefix=f"{prefix}.transformer") - self.linear_proj = GLU(config, - in_features=config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.linear_proj") - self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, - out_channels=config.hidden_size, - kernel_size=2, - stride=2) - self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) - self.scaling_factor = vision_config.scaling_factor - - def forward(self, images: torch.Tensor) -> torch.Tensor: - """ - Parameters: - images : torch.Tensor - Input image tensor with shape (B, C, H, W) - - Returns: - torch.Tensor - Transformed tensor with shape (B, L, D) - """ - x = self.patch_embedding(images) - x = self.transformer(x) - x = x[:, 1:] - - b, s, h = x.shape - grid_size = int(s**0.5) - x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2) - x = self.conv(x) - - x = x.flatten(2).transpose(1, 2) - x = self.linear_proj(x) - boi = self.boi.expand(x.shape[0], -1, -1) - eoi = self.eoi.expand(x.shape[0], -1, -1) - x = torch.cat((boi, x, eoi), dim=1) - x = x / self.scaling_factor - return x diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py new file mode 100644 index 00000000000..67f19841f4a --- /dev/null +++ b/vllm/model_executor/models/glm4v.py @@ -0,0 +1,662 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://github.com/THUDM/CogAgent +"""Inference-only CogAgent model compatible with THUDM weights.""" +from argparse import Namespace +from typing import List, Literal, Mapping, Optional, TypedDict, Union + +import torch +from torch import nn +from torch.nn import LayerNorm +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from transformers import PreTrainedTokenizer, TensorType +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput + +from vllm.attention import AttentionMetadata +from vllm.attention.layer import MultiHeadAttention +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from 
vllm.model_executor.layers.activation import SiluAndMul, get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, BatchFeature, + MultiModalFieldConfig, + PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs import ChatGLMConfig + +from .chatglm import ChatGLMBaseModel, ChatGLMModel +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .utils import flatten_bn, merge_multimodal_embeddings + + +class GLMVImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """Shape: `(batch_size, num_channels, height, width)`""" + + +class EVA2CLIPPatchEmbedding(nn.Module): + + def __init__(self, config): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, + config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size) + self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.position_embedding = nn.Embedding(config.num_positions, + config.hidden_size) + + def forward(self, images: torch.Tensor) -> torch.Tensor: + """ + Parameters: + images : torch.Tensor + Input image tensor with shape (B, C, H, W) + + Returns: + torch.Tensor + Transformed tensor with shape (B, L, D) + """ + images = images.to(device=self.proj.weight.device, + dtype=self.proj.weight.dtype) + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + +class EVA2CLIPAttention(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.hidden_size = config.hidden_size + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_rank = config.num_heads // self.tp_size + self.head_dim = config.hidden_size // config.num_heads + self.scale = self.head_dim**-0.5 + + self.query_key_value = QKVParallelLinear( + config.hidden_size, + self.head_dim, + config.num_heads, + quant_config=quant_config, + prefix=f"{prefix}.query_key_value", + ) + self.dense = RowParallelLinear( + config.hidden_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.dense", + ) + + self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, + self.scale) + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + qkv, _ = self.query_key_value(x) # B, L, 3 * H * D + q, k, v = qkv.chunk(3, dim=-1) + + out = self.attn(q, k, v) + output, _ = self.dense(out) + output = self.output_dropout(output) + return output + + +class EVA2CLIPMLP(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + self.fc1 = 
ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, _ = self.fc1(x) + x = self.activation_fn(x) + x, _ = self.fc2(x) + return x + + +class EVA2CLIPTransformerLayer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.input_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.attention = EVA2CLIPAttention(config, + quant_config=quant_config, + prefix=f"{prefix}.attention") + self.mlp = EVA2CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + self.post_attention_layernorm = LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + def forward(self, hidden_states): + attention_input = hidden_states + attention_output = self.input_layernorm( + self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class EVA2CLIPTransformer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + super().__init__() + self.layers = nn.ModuleList([ + EVA2CLIPTransformerLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) + ]) + + def forward(self, hidden_states): + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class EVA2CLIPGLU(nn.Module): + + def __init__( + self, + config, + in_features, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', + ): + """ + The original implementation is the same as: + ```python + self.dense_h_to_4h = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + + self.gate_proj = ColumnParallelLinear( + config.hidden_size, + config.ffn_hidden_size, + bias=False, + quant_config=quant_config + ) + ``` + ``` + gate_proj_output, _ = self.gate_proj(x) + dense_h_to_4h_output, _ = self.dense_h_to_4h(x) + x = torch.cat([gate_proj_output, dense_h_to_4h_output], dim=-1) + ``` + + We merge two ColumnParallelLinear into one MergedColumnParallelLinear: + ``` + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, + [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config + ) + ``` + ``` + x, _ = self.merged_proj(x) + ``` + """ + super().__init__() + self.linear_proj = ReplicatedLinear(in_features, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear_proj") + self.norm1 = nn.LayerNorm(config.hidden_size) + self.act1 = nn.GELU() + self.act2 = SiluAndMul() + + self.merged_proj = MergedColumnParallelLinear( + config.hidden_size, [config.ffn_hidden_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.merged_proj") + + self.dense_4h_to_h = RowParallelLinear( + config.ffn_hidden_size, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.dense_4h_to_h") + + def forward(self, x): + x, _ = self.linear_proj(x) + x = self.act1(self.norm1(x)) + x, _ = self.merged_proj(x) + x = self.act2(x) + x, _ = 
self.dense_4h_to_h(x)
+        return x
+
+
+class EVA2CLIPModel(nn.Module):
+
+    def __init__(
+        self,
+        config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = '',
+    ):
+        super().__init__()
+        vision_config = Namespace(**config.vision_config)
+        self.patch_embedding = EVA2CLIPPatchEmbedding(vision_config)
+        self.transformer = EVA2CLIPTransformer(vision_config,
+                                               quant_config=quant_config,
+                                               prefix=f"{prefix}.transformer")
+        self.linear_proj = EVA2CLIPGLU(config,
+                                       in_features=config.hidden_size,
+                                       quant_config=quant_config,
+                                       prefix=f"{prefix}.linear_proj")
+        self.conv = nn.Conv2d(in_channels=vision_config.hidden_size,
+                              out_channels=config.hidden_size,
+                              kernel_size=2,
+                              stride=2)
+        self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.scaling_factor = vision_config.scaling_factor
+
+    def forward(self, images: torch.Tensor) -> torch.Tensor:
+        """
+        Parameters:
+        images : torch.Tensor
+            Input image tensor with shape (B, C, H, W)
+
+        Returns:
+        torch.Tensor
+            Transformed tensor with shape (B, L, D)
+        """
+        x = self.patch_embedding(images)
+        x = self.transformer(x)
+        x = x[:, 1:]
+
+        b, s, h = x.shape
+        grid_size = int(s**0.5)
+        x = x.view(b, grid_size, grid_size, h).permute(0, 3, 1, 2)
+        x = self.conv(x)
+
+        x = x.flatten(2).transpose(1, 2)
+        x = self.linear_proj(x)
+        boi = self.boi.expand(x.shape[0], -1, -1)
+        eoi = self.eoi.expand(x.shape[0], -1, -1)
+        x = torch.cat((boi, x, eoi), dim=1)
+        x = x / self.scaling_factor
+        return x
+
+
+class GLM4VModel(ChatGLMModel):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+        quant_config = vllm_config.quant_config
+
+        self.vision = EVA2CLIPModel(self.config,
+                                    quant_config,
+                                    prefix=f"{prefix}.vision")
+
+
+class GLM4VProcessor:
+    """
+    This model doesn't define its own HF processor,
+    so we implement our own one here.
+ """ + + def __init__( + self, + config: ChatGLMConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + vision_config = config.vision_config + image_size = vision_config["image_size"] + + self.image_transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ), + ]) + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + text_inputs = self.tokenizer(text) + + if len(images) == 0: + image_inputs = {} + else: + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + +class GLM4VProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self): + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + return tokenizer + + def get_hf_config(self): + return self.ctx.get_hf_config(ChatGLMConfig) + + def get_hf_processor(self) -> GLM4VProcessor: + return GLM4VProcessor( + self.get_hf_config(), + self.get_tokenizer(), + ) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_num_image_feature_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + + image_size = vision_config["image_size"] + patch_size = vision_config["patch_size"] + grid_length = image_size // patch_size // 2 + return grid_length * grid_length + + def get_num_image_feature_tokens(self) -> int: + # EVA2CLIPModel has embeddings for boi and eoi tokens as well + return self.get_num_image_tokens() + 2 + + +class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + + target_width = target_height = vision_config["image_size"] + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>" + + return ProcessorInputs( + prompt_text=base_text * num_images, + mm_data=mm_data, + ) + + +class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = 
self.info.get_hf_config()
+
+        boi_token_id = hf_config.boi_token_id
+        image_token_id = hf_config.pad_token_id
+        eoi_token_id = hf_config.eoi_token_id
+
+        def get_replacement(item_idx: int):
+            num_image_tokens = self.info.get_num_image_tokens()
+            image_tokens = [image_token_id] * num_image_tokens
+
+            return [boi_token_id] + image_tokens + [eoi_token_id]
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[boi_token_id, image_token_id, eoi_token_id],
+                replacement=get_replacement,
+            ),
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
+                                        info=GLM4VProcessingInfo,
+                                        dummy_inputs=GLM4VDummyInputsBuilder)
+class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
+                       SupportsMultiModal):
+
+    packed_modules_mapping = {
+        "query_key_value": ["query_key_value"],
+        "dense_h_to_4h": ["dense_h_to_4h"],
+        "merged_proj": ["gate_proj", "dense_h_to_4h"]
+    }
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "query_key_value",
+        "dense",
+        "dense_h_to_4h",
+        "dense_4h_to_h",
+        # vision
+        "fc1",
+        "fc2",
+        "merged_proj",
+        "linear_proj"
+    ]
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="transformer.encoder",
+            connector="transformer.vision.linear_proj",
+            tower_model="transformer.vision.transformer")
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        transformer_type: type[GLM4VModel] = GLM4VModel,
+    ) -> None:
+        super().__init__(
+            vllm_config=vllm_config,
+            prefix=prefix,
+            transformer_type=transformer_type,
+        )
+
+        self.transformer: GLM4VModel
+
+    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
+        h = w = self.config.vision_config["image_size"]
+        expected_dims = (3, h, w)
+        actual_dims = tuple(data.shape[1:])
+
+        if actual_dims != expected_dims:
+            expected_expr = ("batch_size", *map(str, expected_dims))
+            raise ValueError(
+                f"The expected shape of pixel values is {expected_expr}. "
+                f"You supplied {tuple(data.shape)}.")
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[GLMVImagePixelInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+
+        if pixel_values is not None:
+            if not isinstance(pixel_values, torch.Tensor):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            return GLMVImagePixelInputs(
+                type="pixel_values",
+                data=self._validate_pixel_values(
+                    flatten_bn(pixel_values, concat=True)),
+            )
+
+        return None
+
+    def _process_image_input(
+            self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
+        pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)
+
+        return self.transformer.vision(pixel_values)
+
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+
+        vision_embeddings = self._process_image_input(image_input)
+        return vision_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.transformer.get_input_embeddings(input_ids)
+
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                multimodal_embeddings=multimodal_embeddings,
+                placeholder_token_id=[
+                    self.config.boi_token_id,
+                    self.config.pad_token_id,
+                    self.config.eoi_token_id,
+                ],
+            )
+
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs: object,
+    ) -> Union[torch.Tensor, IntermediateTensors]:
+        if intermediate_tensors is not None:
+            inputs_embeds = None
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         attn_metadata, intermediate_tensors,
+                                         inputs_embeds)
+
+        return hidden_states
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 4b8aeaddbdd..a45e9463ab6 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -6,381 +6,35 @@
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
 """Inference-only QWen model compatible with HuggingFace weights."""
 
-import copy
-import math
-import re
-import unicodedata
-from functools import lru_cache, partial
-from typing import (AbstractSet, Any, Callable, Collection, Dict, Iterable,
-                    List, Literal, Mapping, Optional, Set, Tuple, TypedDict,
-                    Union)
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import torch
 from torch import nn
-from torchvision import transforms
-from torchvision.transforms import InterpolationMode
-from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer,
-                          TensorType)
-from transformers.image_utils import ImageInput
-from transformers.tokenization_utils_base import TextInput
+from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
-from vllm.logger import init_logger
-from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
+from vllm.model_executor.layers.activation import SiluAndMul
 from 
vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, - ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) -from vllm.multimodal.parse import MultiModalDataItems -from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement, - PromptReplacementDetails) -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (flatten_bn, is_pp_missing_parameter, +from .interfaces import SupportsLoRA, SupportsPP +from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix, merge_multimodal_embeddings) - -logger = init_logger(__name__) - - -class QwenImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor - """ - Shape: `(batch_size * num_images, 3, image_size, image_size)` - - Note that image_size is the value in the vision config to which we resize - the image to in the normalization transform. Currently multi-image support - can only be leveraged by passing image embeddings directly. - """ - - -class QwenImageEmbeddingInputs(TypedDict): - type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, 256, hidden_size)` - - `hidden_size` must match the hidden size of the language model backbone - and is stored in the visual config of the model if we have one. - """ - - -QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs] - - -class VisualAttention(nn.Module): - """self-attention layer class. - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__( - self, - embed_dim: int, - num_heads: int, - bias: bool = True, - kdim: Optional[int] = None, - vdim: Optional[int] = None, - ): - super().__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self._qkv_same_embed_dim = self.kdim == embed_dim \ - and self.vdim == embed_dim - - self.num_heads = num_heads - - # Per attention head and per partition values. - assert embed_dim % num_heads == 0 - self.hidden_size_per_attention_head = embed_dim // num_heads - self.num_attention_heads_per_partition = num_heads - self.hidden_size_per_partition = embed_dim - - # Strided linear layer. 
- assert self._qkv_same_embed_dim, \ - 'Visual Attention implementation only supports self-attention' - self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim) - self.out_proj = ReplicatedLinear(embed_dim, embed_dim) - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - - def forward( - self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - # query/key/value: [sq, b, h] - sq, b, _ = x.size() - mixed_x_layer, _ = self.in_proj(x) - - # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - query_layer, key_layer, value_layer = mixed_x_layer.split( - self.hidden_size_per_attention_head, dim=-1) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view( - sq, b * self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head).transpose(0, 1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view( - sq, b * self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head).transpose(0, 1) - - q_scaled = query_layer / self.norm_factor - if attn_mask is not None: - attention_probs = torch.baddbmm(attn_mask, q_scaled, - key_layer.transpose(-2, -1)) - else: - attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1)) - attention_probs = attention_probs.softmax(dim=-1) - - value_layer = value_layer.view( - sq, b * self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head).transpose(0, 1) - - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer) - - # change view [b, np, sq, hn] - context_layer = context_layer.view( - b, self.num_attention_heads_per_partition, sq, - self.hidden_size_per_attention_head) - - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + \ - (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - output, _ = self.out_proj(context_layer) - - return output - - -class QwenVMLP(nn.Module): - """MLP for the visual component of the Qwen model.""" - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - self.c_fc = ColumnParallelLinear(hidden_size, - intermediate_size, - bias=True, - quant_config=quant_config) - self.act_fn = get_act_fn("gelu") - self.c_proj = RowParallelLinear( - intermediate_size, - hidden_size, - bias=True, - quant_config=quant_config, - ) - - def forward(self, x): - x, _ = self.c_fc(x) - x = self.act_fn(x) - x, _ = self.c_proj(x) - return x - - -class VisualAttentionBlock(nn.Module): - - def __init__( - self, - d_model: int, - n_head: int, - mlp_ratio: float = 4.0, - norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - - self.ln_1 = norm_layer(d_model) - self.ln_2 = norm_layer(d_model) - mlp_width = int(d_model * mlp_ratio) - self.attn = VisualAttention(d_model, n_head) - self.mlp = QwenVMLP( - hidden_size=d_model, - intermediate_size=mlp_width, - quant_config=quant_config, - ) - - def attention( - self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - attn_mask = 
attn_mask.to(x.dtype) if attn_mask is not None else None - return self.attn(x, attn_mask=attn_mask) - - def forward( - self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - x = x + self.attention(self.ln_1(x), attn_mask=attn_mask) - x = x + self.mlp(self.ln_2(x)) - return x - - -class TransformerBlock(nn.Module): - - def __init__( - self, - width: int, - layers: int, - heads: int, - mlp_ratio: float = 4.0, - norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - quant_config: Optional[QuantizationConfig] = None, - ): - super().__init__() - self.width = width - self.layers = layers - - self.resblocks = nn.ModuleList([ - VisualAttentionBlock(width, - heads, - mlp_ratio, - norm_layer=norm_layer, - quant_config=quant_config) - for _ in range(layers) - ]) - - def get_cast_dtype(self) -> torch.dtype: - return self.resblocks[0].mlp.c_fc.weight.dtype - - def get_cast_device(self) -> torch.device: - return self.resblocks[0].mlp.c_fc.weight.device - - def forward(self, - x: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: - for r in self.resblocks: - x = r(x, attn_mask=attn_mask) - return x - - -class VisionTransformer(nn.Module): - - def __init__(self, - image_size: int, - patch_size: int, - width: int, - layers: int, - heads: int, - mlp_ratio: float, - n_queries: int = 256, - output_dim: int = 512, - image_start_id: int = 151857, - quant_config: Optional[QuantizationConfig] = None, - **kwargs): - super().__init__() - image_height, image_width = self.image_size = (image_size, image_size) - patch_height, patch_width = self.patch_size = (patch_size, patch_size) - self.grid_size = (image_height // patch_height, - image_width // patch_width) - self.output_dim = output_dim - self.conv1 = nn.Conv2d(in_channels=3, - out_channels=width, - kernel_size=patch_size, - stride=patch_size, - bias=False) - - # class embeddings and positional embeddings - scale = width**-0.5 - self.positional_embedding = nn.Parameter(scale * - torch.randn(256, width)) - - norm_layer = partial(nn.LayerNorm, eps=1e-6) - - self.ln_pre = norm_layer(width) - self.transformer = TransformerBlock(width, - layers, - heads, - mlp_ratio, - norm_layer=norm_layer, - quant_config=quant_config) - - self.attn_pool = Resampler2( - grid_size=int(math.sqrt(n_queries)), - embed_dim=output_dim, - num_heads=output_dim // 128, - kv_dim=width, - norm_layer=norm_layer, - adaptive=False, - do_post_projection=False, - ).to( - device=self.positional_embedding.device, - dtype=self.positional_embedding.dtype, - ) - - self.ln_post = norm_layer(output_dim) - self.proj = nn.Parameter( - (output_dim**-0.5) * torch.randn(output_dim, output_dim)) - - self.image_start_id = image_start_id - self.image_end_id = image_start_id + 1 - self.image_pad_id = image_start_id + 2 - - def forward(self, x: torch.Tensor) -> torch.Tensor: - x = x.to( - dtype=self.transformer.get_cast_dtype(), - device=self.transformer.get_cast_device(), - ) - - # to patches - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], - -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] - - x = x + get_abs_pos(self.positional_embedding, int(math.sqrt( - x.size(1)))) - - x = self.ln_pre(x) - - x = x.permute(1, 0, 2) # NLD -> LND - x = self.transformer(x) - x = x.permute(1, 0, 2) # LND -> NLD - - x = self.attn_pool(x) - x = self.ln_post(x) - x = x @ self.proj - - return x + maybe_prefix) class QWenMLP(nn.Module): @@ -564,12 +218,6 @@ def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - if (vision_config := getattr(config, "visual", None)): - self.visual = VisionTransformer(**vision_config, - quant_config=quant_config) - else: - self.visual = None - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.wte(input_ids) @@ -592,6 +240,7 @@ def forward( assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): layer = self.h[i] hidden_states, residual = layer( @@ -610,302 +259,25 @@ def forward( return hidden_states -@lru_cache(maxsize=1) -def _get_tokenizer_without_image_pad( - tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: - """ - The logic of adding image pad tokens should only be applied in - :class:`QWenVLProcessor`, so they are patched out here. - - The definition of the wrapped tokenizer can be found here: - https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py - """ - new_tokenizer = copy.deepcopy(tokenizer) - - class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore - - def tokenize( - self, - text: str, - allowed_special: Union[AbstractSet[str], str] = "all", - disallowed_special: Union[Collection[str], str] = (), - **kwargs, - ) -> list[Union[bytes, str]]: - text = unicodedata.normalize("NFC", text) - - return [ - self.decoder[t] for t in self.tokenizer.encode( - text, - allowed_special=allowed_special, - disallowed_special=disallowed_special, - ) - ] - - def _decode( - self, - token_ids: Union[int, List[int]], - skip_special_tokens: bool = False, - errors: Optional[str] = None, - **kwargs, - ) -> str: - if isinstance(token_ids, int): - token_ids = [token_ids] - - return self.tokenizer.decode( - token_ids, - errors=errors or self.errors, - ) - - TokenizerWithoutImagePad.__name__ = \ - f"{tokenizer.__class__.__name__}WithoutImagePad" - - new_tokenizer.__class__ = TokenizerWithoutImagePad - return new_tokenizer - - -class QWenVLProcessor: - """ - This model doesn't define its own HF processor, - so we implement our own one here. 
- - We call the wrapped tokenizer to automatically insert image pad tokens: - https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245 - - The image processor is defined here: - https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354 - """ +class QWenBaseModel(nn.Module): def __init__( self, - config: PretrainedConfig, - tokenizer: PreTrainedTokenizer, + *, + vllm_config: VllmConfig, + prefix: str = "", + transformer_type: type[QWenModel] = QWenModel, ) -> None: super().__init__() - - self.config = config - self.tokenizer = tokenizer - - if vision_config := getattr(self.config, "visual", None): - image_size = vision_config["image_size"] - - self.image_transform = transforms.Compose([ - transforms.Resize( - (image_size, image_size), - interpolation=InterpolationMode.BICUBIC, - ), - transforms.ToTensor(), - transforms.Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ]) - else: - self.image_transform = None - - @property - def image_start_tag(self) -> str: - return self.tokenizer.image_start_tag # type: ignore - - @property - def image_end_tag(self) -> str: - return self.tokenizer.image_end_tag # type: ignore - - @property - def image_pad_tag(self) -> str: - return self.tokenizer.image_pad_tag # type: ignore - - def __call__( - self, - text: Optional[Union[TextInput, list[TextInput]]] = None, - images: Optional[Union[ImageInput, list[ImageInput]]] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - ) -> BatchFeature: - if text is None: - text = [] - if not isinstance(text, list): - text = [text] - if images is None: - images = [] - if not isinstance(images, list): - images = [images] - - text_inputs = self.tokenizer(text) - - if len(images) == 0: - image_inputs = {} - else: - if self.image_transform is None: - raise ValueError("This model does not support image inputs") - - pixel_values = [self.image_transform(image) for image in images] - image_inputs = {"pixel_values": torch.stack(pixel_values)} - - return BatchFeature( - { - **text_inputs, - **image_inputs, - }, - tensor_type=return_tensors, - ) - - -class QWenVLProcessingInfo(BaseProcessingInfo): - - def get_tokenizer(self) -> PreTrainedTokenizer: - tokenizer = self.ctx.tokenizer - assert isinstance(tokenizer, PreTrainedTokenizer) - - return _get_tokenizer_without_image_pad(tokenizer) - - def get_hf_processor(self) -> QWenVLProcessor: - tokenizer = self.ctx.tokenizer - assert isinstance(tokenizer, PreTrainedTokenizer) - - return QWenVLProcessor(self.get_hf_config(), tokenizer) - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} - - def get_mm_max_tokens_per_item( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> Mapping[str, int]: - return {"image": self.get_num_image_tokens()} - - def get_num_image_tokens(self) -> int: - hf_config = self.get_hf_config() - if not (vision_config := getattr(hf_config, "visual", None)): - return 0 - - image_size = vision_config["image_size"] - patch_size = vision_config["patch_size"] - grid_length = image_size // patch_size // 2 - return grid_length * grid_length - - -class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): - - def get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.info.get_hf_config() - if not (vision_config := getattr(hf_config, "visual", None)): - return ProcessorInputs(prompt_text="", mm_data={}) - - processor = self.info.get_hf_processor() - 
img_start = processor.image_start_tag - img_end = processor.image_end_tag - - target_width = target_height = vision_config["image_size"] - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n" - for i in range(1, num_images + 1)), - mm_data=mm_data, - ) - - -class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]): - - def _call_hf_processor( - self, - prompt: str, - mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], - ) -> BatchFeature: - # Drops anything between / tags; encoding with the tokenizer - # will automatically add the image pads for the context. - prompt, num_matched_images = re.subn( - r"(Picture \d*: ).*?(<\/img>\n)", - r"\1\2", - prompt, - ) - - image_data = mm_data.get("images") - if image_data is not None: - assert isinstance(image_data, list) - - num_images = len(image_data) - if num_matched_images != num_images: - logger.warning( - "Number of matched image placeholders %s doesn't match " - "the number of expected images %s; check your placeholder " - "formatting.", num_matched_images, num_images) - - return super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - - def _get_prompt_replacements( - self, - mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], - out_mm_kwargs: MultiModalKwargs, - ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - if not hasattr(hf_config, "visual"): - return [] - - tokenizer = self.info.get_tokenizer() - special_tokens: dict[str, - int] = tokenizer.special_tokens # type: ignore - - processor = self.info.get_hf_processor() - img_start_id = special_tokens[processor.image_start_tag] - img_end_id = special_tokens[processor.image_end_tag] - img_pad_id = special_tokens[processor.image_pad_tag] - - num_image_tokens = self.info.get_num_image_tokens() - image_tokens = [img_pad_id] * num_image_tokens - - return [ - PromptReplacement( - modality="image", - target=[img_start_id, img_end_id], - replacement=PromptReplacementDetails( - full=[img_start_id] + image_tokens + [img_end_id], - features=image_tokens, - ), - ) - ] - - -class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.quant_config = quant_config - self.transformer = QWenModel(vllm_config=vllm_config, - prefix=maybe_prefix( - prefix, "transformer")) + self.transformer = transformer_type(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -916,104 +288,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) - def _validate_pixel_values(self, data: torch.Tensor) -> 
torch.Tensor: - h = w = self.config.visual["image_size"] - expected_dims = (3, h, w) - actual_dims = tuple(data.shape[1:]) - - if actual_dims != expected_dims: - expected_expr = ("batch_size", *map(str, expected_dims)) - raise ValueError( - f"The expected shape of pixel values is {expected_expr}. " - f"You supplied {tuple(data.shape)}.") - - return data - - def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[QwenImageInputs]: - pixel_values = kwargs.pop("pixel_values", None) - image_embeds = kwargs.pop("image_embeds", None) - - if pixel_values is not None: - if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - return QwenImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), - ) - - if image_embeds is not None: - if not isinstance(image_embeds, torch.Tensor): - raise ValueError("Incorrect type of image embeddings. " - f"Got type: {type(image_embeds)}") - - return QwenImageEmbeddingInputs( - type="image_embeds", - data=flatten_bn(image_embeds), - ) - - return None - - def _process_image_input(self, - image_input: QwenImageInputs) -> torch.Tensor: - if image_input["type"] == "image_embeds": - return image_input["data"] - - assert self.transformer.visual is not None - return self.transformer.visual(image_input["data"]) - - def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is None: - return None - - vision_embeddings = self._process_image_input(image_input) - return vision_embeddings - - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: Optional[NestedTensors] = None, - ) -> torch.Tensor: - inputs_embeds = self.transformer.get_input_embeddings(input_ids) - - if multimodal_embeddings is not None: - assert self.transformer.visual is not None - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.transformer.visual.image_pad_id) - - return inputs_embeds - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - **kwargs: object, - ) -> Union[torch.Tensor, IntermediateTensors]: - if intermediate_tensors is not None: - inputs_embeds = None - - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. 
- elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) - input_ids = None - - hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) - return hidden_states - def compute_logits( self, hidden_states: torch.Tensor, @@ -1072,26 +346,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -class QWenLLM(QWenBaseModel): - packed_modules_mapping = { - "c_attn": ["c_attn"], - "gate_up_proj": [ - "w2", - "w1", - ], - } - # LoRA specific attributes - supported_lora_modules = [ - "c_attn", - "gate_up_proj", - "c_proj", - ] - - embedding_modules = {} - embedding_padding_modules = [] - - -class QWenVL(QWenBaseModel, SupportsMultiModal): +class QWenLMHeadModel(QWenBaseModel, SupportsPP, SupportsLoRA): packed_modules_mapping = { "c_attn": ["c_attn"], "gate_up_proj": [ @@ -1104,62 +359,35 @@ class QWenVL(QWenBaseModel, SupportsMultiModal): "c_attn", "gate_up_proj", "c_proj", - # visual module - "out_proj", - "in_proj", - "c_fc", - # resampler - "kv_proj", ] embedding_modules = {} embedding_padding_modules = [] - def get_mm_mapping(self) -> MultiModelKeys: - """ - Get the module prefix in multimodal models - """ - return MultiModelKeys.from_string_field( - language_model="transformer.h", - connector="transformer.visual.attn_pool", - tower_model="transformer.visual.transformer") - - -@MULTIMODAL_REGISTRY.register_processor(QWenVLMultiModalProcessor, - info=QWenVLProcessingInfo, - dummy_inputs=QWenVLDummyInputsBuilder) -class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): - """ - QWenLMHeadModel is not only applicable to LLM but also to VL, which is not - conducive to the current integration logic of LoRA in vLLM. Therefore, it - is necessary to separate them. - """ - # Ensure that the LoRA support check passes when the class is not - # initialized, but set all these attributes to empty. - # These will be updated when an instance class is selected - packed_modules_mapping = {} - supported_lora_modules = [] - embedding_modules = {} - embedding_padding_modules = [] - - def __new__( - cls, - vllm_config: VllmConfig, - prefix: str = "", - ) -> QWenBaseModel: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config + if hasattr(config, "visual"): + hf_overrides = { + "architectures": ["QwenVLForConditionalGeneration"] + } + raise RuntimeError( + "The configuration of this model indicates that it supports " + "vision inputs, but you instantiated the text-only version " + "of this model. 
Please use the vision model by setting " + f"`--hf-overrides {hf_overrides!r}`") + + super().__init__(vllm_config=vllm_config, prefix=prefix) - # Initialize VL - if hasattr(config, "visual"): # noqa: SIM108 - instance_cls = QWenVL - # Initialize LLM - else: - instance_cls = QWenLLM - - # quant_config references base class members, - # so update values before init is called - cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping) - cls.supported_lora_modules += instance_cls.supported_lora_modules - cls.embedding_modules.update(instance_cls.embedding_modules) - cls.embedding_padding_modules += instance_cls.embedding_padding_modules - return instance_cls(vllm_config=vllm_config, prefix=prefix) + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.transformer(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + return hidden_states diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py new file mode 100644 index 00000000000..5316eb7e002 --- /dev/null +++ b/vllm/model_executor/models/qwen_vl.py @@ -0,0 +1,794 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://huggingface.co/Qwen/Qwen-VL/blob/main/modeling_qwen.py +# Copyright (c) Alibaba Cloud. +"""Inference-only Qwen-VL model compatible with HuggingFace weights.""" + +import copy +import math +import re +import unicodedata +from functools import lru_cache, partial +from typing import (AbstractSet, Callable, Collection, List, Literal, Mapping, + Optional, TypedDict, Union) + +import torch +from torch import nn +from torchvision import transforms +from torchvision.transforms import InterpolationMode +from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer, + TensorType) +from transformers.image_utils import ImageInput +from transformers.tokenization_utils_base import TextInput + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP +from .qwen import QWenBaseModel, QWenModel +from .utils import flatten_bn, merge_multimodal_embeddings + + +class QwenImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, 3, image_size, image_size)` + + Note that image_size is the value in the vision config to which we resize + the image to in the normalization 
transform. Currently multi-image support + can only be leveraged by passing image embeddings directly. + """ + + +class QwenImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """Shape: `(batch_size * num_images, 256, hidden_size)` + + `hidden_size` must match the hidden size of the language model backbone + and is stored in the visual config of the model if we have one. + """ + + +QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs] + + +class VisualAttention(nn.Module): + """self-attention layer class. + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + kdim: Optional[int] = None, + vdim: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim \ + and self.vdim == embed_dim + + self.num_heads = num_heads + + # Per attention head and per partition values. + assert embed_dim % num_heads == 0 + self.hidden_size_per_attention_head = embed_dim // num_heads + self.num_attention_heads_per_partition = num_heads + self.hidden_size_per_partition = embed_dim + + # Strided linear layer. + assert self._qkv_same_embed_dim, \ + 'Visual Attention implementation only supports self-attention' + self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim) + self.out_proj = ReplicatedLinear(embed_dim, embed_dim) + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # query/key/value: [sq, b, h] + sq, b, _ = x.size() + mixed_x_layer, _ = self.in_proj(x) + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + query_layer, key_layer, value_layer = mixed_x_layer.split( + self.hidden_size_per_attention_head, dim=-1) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + q_scaled = query_layer / self.norm_factor + if attn_mask is not None: + attention_probs = torch.baddbmm(attn_mask, q_scaled, + key_layer.transpose(-2, -1)) + else: + attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1)) + attention_probs = attention_probs.softmax(dim=-1) + + value_layer = value_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer) + + # change view [b, np, sq, hn] + context_layer = context_layer.view( + b, self.num_attention_heads_per_partition, sq, + self.hidden_size_per_attention_head) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = 
context_layer.view(*new_context_layer_shape) + + output, _ = self.out_proj(context_layer) + + return output + + +class QwenVLMLP(nn.Module): + """MLP for the visual component of the Qwen model.""" + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.c_fc = ColumnParallelLinear(hidden_size, + intermediate_size, + bias=True, + quant_config=quant_config) + self.act_fn = get_act_fn("gelu") + self.c_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=True, + quant_config=quant_config, + ) + + def forward(self, x): + x, _ = self.c_fc(x) + x = self.act_fn(x) + x, _ = self.c_proj(x) + return x + + +class VisualAttentionBlock(nn.Module): + + def __init__( + self, + d_model: int, + n_head: int, + mlp_ratio: float = 4.0, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + + self.ln_1 = norm_layer(d_model) + self.ln_2 = norm_layer(d_model) + mlp_width = int(d_model * mlp_ratio) + self.attn = VisualAttention(d_model, n_head) + self.mlp = QwenVLMLP( + hidden_size=d_model, + intermediate_size=mlp_width, + quant_config=quant_config, + ) + + def attention( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None + return self.attn(x, attn_mask=attn_mask) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + x = x + self.attention(self.ln_1(x), attn_mask=attn_mask) + x = x + self.mlp(self.ln_2(x)) + return x + + +class TransformerBlock(nn.Module): + + def __init__( + self, + width: int, + layers: int, + heads: int, + mlp_ratio: float = 4.0, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.width = width + self.layers = layers + + self.resblocks = nn.ModuleList([ + VisualAttentionBlock(width, + heads, + mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config) + for _ in range(layers) + ]) + + def get_cast_dtype(self) -> torch.dtype: + return self.resblocks[0].mlp.c_fc.weight.dtype + + def get_cast_device(self) -> torch.device: + return self.resblocks[0].mlp.c_fc.weight.device + + def forward(self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + for r in self.resblocks: + x = r(x, attn_mask=attn_mask) + return x + + +class VisionTransformer(nn.Module): + + def __init__(self, + image_size: int, + patch_size: int, + width: int, + layers: int, + heads: int, + mlp_ratio: float, + n_queries: int = 256, + output_dim: int = 512, + image_start_id: int = 151857, + quant_config: Optional[QuantizationConfig] = None, + **kwargs): + super().__init__() + image_height, image_width = self.image_size = (image_size, image_size) + patch_height, patch_width = self.patch_size = (patch_size, patch_size) + self.grid_size = (image_height // patch_height, + image_width // patch_width) + self.output_dim = output_dim + self.conv1 = nn.Conv2d(in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + # class embeddings and positional embeddings + scale = width**-0.5 + self.positional_embedding = nn.Parameter(scale * + torch.randn(256, width)) + + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.ln_pre = norm_layer(width) + self.transformer = TransformerBlock(width, + layers, + heads, + 
mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config) + + self.attn_pool = Resampler2( + grid_size=int(math.sqrt(n_queries)), + embed_dim=output_dim, + num_heads=output_dim // 128, + kv_dim=width, + norm_layer=norm_layer, + adaptive=False, + do_post_projection=False, + ).to( + device=self.positional_embedding.device, + dtype=self.positional_embedding.dtype, + ) + + self.ln_post = norm_layer(output_dim) + self.proj = nn.Parameter( + (output_dim**-0.5) * torch.randn(output_dim, output_dim)) + + self.image_start_id = image_start_id + self.image_end_id = image_start_id + 1 + self.image_pad_id = image_start_id + 2 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.to( + dtype=self.transformer.get_cast_dtype(), + device=self.transformer.get_cast_device(), + ) + + # to patches + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + x = x + get_abs_pos(self.positional_embedding, int(math.sqrt( + x.size(1)))) + + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.attn_pool(x) + x = self.ln_post(x) + x = x @ self.proj + + return x + + +class QwenVLModel(QWenModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.visual = VisionTransformer(**config.visual, + quant_config=quant_config) + + +@lru_cache(maxsize=1) +def _get_tokenizer_without_image_pad( + tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: + """ + The logic of adding image pad tokens should only be applied in + :class:`QwenVLProcessor`, so they are patched out here. + + The definition of the wrapped tokenizer can be found here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py + """ + new_tokenizer = copy.deepcopy(tokenizer) + + class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore + + def tokenize( + self, + text: str, + allowed_special: Union[AbstractSet[str], str] = "all", + disallowed_special: Union[Collection[str], str] = (), + **kwargs, + ) -> list[Union[bytes, str]]: + text = unicodedata.normalize("NFC", text) + + return [ + self.decoder[t] for t in self.tokenizer.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ] + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: Optional[str] = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, + errors=errors or self.errors, + ) + + TokenizerWithoutImagePad.__name__ = \ + f"{tokenizer.__class__.__name__}WithoutImagePad" + + new_tokenizer.__class__ = TokenizerWithoutImagePad + return new_tokenizer + + +class QwenVLProcessor: + """ + This model doesn't define its own HF processor, + so we implement our own one here. 
+ + We call the wrapped tokenizer to automatically insert image pad tokens: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245 + + The image processor is defined here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354 + """ + + def __init__( + self, + config: PretrainedConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + vision_config = config.visual + image_size = vision_config["image_size"] + + self.image_transform = transforms.Compose([ + transforms.Resize( + (image_size, image_size), + interpolation=InterpolationMode.BICUBIC, + ), + transforms.ToTensor(), + transforms.Normalize( + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711), + ), + ]) + + @property + def image_start_tag(self) -> str: + return self.tokenizer.image_start_tag # type: ignore + + @property + def image_end_tag(self) -> str: + return self.tokenizer.image_end_tag # type: ignore + + @property + def image_pad_tag(self) -> str: + return self.tokenizer.image_pad_tag # type: ignore + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + text_inputs = self.tokenizer(text) + + if len(images) == 0: + image_inputs = {} + else: + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + +class QwenVLProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self) -> PreTrainedTokenizer: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return _get_tokenizer_without_image_pad(tokenizer) + + def get_hf_processor(self) -> QwenVLProcessor: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return QwenVLProcessor(self.get_hf_config(), tokenizer) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + hf_config = self.get_hf_config() + vision_config = hf_config.visual + + image_size = vision_config["image_size"] + patch_size = vision_config["patch_size"] + grid_length = image_size // patch_size // 2 + return grid_length * grid_length + + +class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + vision_config = hf_config.visual + + processor = self.info.get_hf_processor() + img_start = processor.image_start_tag + img_end = processor.image_end_tag + + target_width = target_height = vision_config["image_size"] + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="".join(f"Picture {i}: {img_start}{img_end}\n" + for i in range(1, 
num_images + 1)), + mm_data=mm_data, + ) + + +class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + # Drops anything between / tags; encoding with the tokenizer + # will automatically add the image pads for the context. + prompt, num_matched_images = re.subn( + r"(Picture \d*: ).*?(<\/img>\n)", + r"\1\2", + prompt, + ) + + image_data = mm_data.get("images") + if image_data is not None: + assert isinstance(image_data, list) + + num_images = len(image_data) + assert num_matched_images == num_images + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + + processor = self.info.get_hf_processor() + img_start_id = special_tokens[processor.image_start_tag] + img_end_id = special_tokens[processor.image_end_tag] + img_pad_id = special_tokens[processor.image_pad_tag] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [img_pad_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[img_start_id, img_end_id], + replacement=PromptReplacementDetails( + full=[img_start_id] + image_tokens + [img_end_id], + features=image_tokens, + ), + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor(QwenVLMultiModalProcessor, + info=QwenVLProcessingInfo, + dummy_inputs=QwenVLDummyInputsBuilder) +class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, + SupportsMultiModal): + packed_modules_mapping = { + "c_attn": ["c_attn"], + "gate_up_proj": [ + "w2", + "w1", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + "c_attn", + "gate_up_proj", + "c_proj", + # visual module + "out_proj", + "in_proj", + "c_fc", + # resampler + "kv_proj", + ] + + embedding_modules = {} + embedding_padding_modules = [] + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="transformer.h", + connector="transformer.visual.attn_pool", + tower_model="transformer.visual.transformer") + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + transformer_type: type[QwenVLModel] = QwenVLModel, + ) -> None: + super().__init__( + vllm_config=vllm_config, + prefix=prefix, + transformer_type=transformer_type, + ) + + self.transformer: QwenVLModel + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.visual["image_size"] + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. 
" + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[QwenImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return QwenImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return QwenImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) + + return None + + def _process_image_input(self, + image_input: QwenImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + return self.transformer.visual(image_input["data"]) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.transformer.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.transformer.visual.image_pad_id) + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+        elif inputs_embeds is None:
+            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
+
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         attn_metadata, intermediate_tensors,
+                                         inputs_embeds)
+        return hidden_states
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 198b6d13471..08c4642b4a9 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -39,7 +39,7 @@
     "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),
     "BambaForCausalLM": ("bamba", "BambaForCausalLM"),
     "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
-    # ChatGLMModel supports multimodal
+    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
     "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
     "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
@@ -90,7 +90,7 @@
     "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
     "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
     "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
-    # QWenLMHeadModel supports multimodal
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
     "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
     "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
     "RWForCausalLM": ("falcon", "FalconForCausalLM"),
@@ -156,10 +156,9 @@
     "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
     "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
     "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"),  # noqa: E501
-    "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
-    "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
     "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
     "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
+    "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
     "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
@@ -175,7 +174,7 @@
     "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"),  # noqa: E501
     "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
     "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"),  # noqa: E501
-    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
+    "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"),  # noqa: E501
     "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"),  # noqa: E501
     "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
     "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501

From 34bcd1cd907b39089c5e54667472dcd85756fa96 Mon Sep 17 00:00:00 2001
From: Vaibhav Jain
Date: Thu, 13 Feb 2025 20:22:22 +0530
Subject: [PATCH 0154/1240] [Bugfix] Missing Content Type returns 500 Internal Server Error (#13193)

Signed-off-by: Louis Ulmer
---
 tests/entrypoints/openai/test_basic.py | 16 ++++++++++
 vllm/entrypoints/openai/api_server.py  | 42 +++++++++++++++++---------
 2 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py
index 0d44a7611ae..a970981b756 100644
--- a/tests/entrypoints/openai/test_basic.py
+++ 
b/tests/entrypoints/openai/test_basic.py @@ -156,3 +156,19 @@ async def test_request_cancellation(server: RemoteOpenAIServer): max_tokens=10) assert len(response.choices) == 1 + + +@pytest.mark.asyncio +async def test_request_wrong_content_type(server: RemoteOpenAIServer): + + chat_input = [{"role": "user", "content": "Write a long story"}] + client = server.get_async_client() + + with pytest.raises(openai.APIStatusError): + await client.chat.completions.create( + messages=chat_input, + model=MODEL_NAME, + max_tokens=10000, + extra_headers={ + "Content-Type": "application/x-www-form-urlencoded" + }) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 588a7781c11..b50a72f3a6c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -19,7 +19,7 @@ from typing import AsyncIterator, Dict, Optional, Set, Tuple, Union import uvloop -from fastapi import APIRouter, FastAPI, HTTPException, Request +from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -252,6 +252,15 @@ def _cleanup_ipc_path(): multiprocess.mark_process_dead(engine_process.pid) +async def validate_json_request(raw_request: Request): + content_type = raw_request.headers.get("content-type", "").lower() + if content_type != "application/json": + raise HTTPException( + status_code=HTTPStatus.UNSUPPORTED_MEDIA_TYPE, + detail="Unsupported Media Type: Only 'application/json' is allowed" + ) + + router = APIRouter() @@ -335,7 +344,7 @@ async def ping(raw_request: Request) -> Response: return await health(raw_request) -@router.post("/tokenize") +@router.post("/tokenize", dependencies=[Depends(validate_json_request)]) @with_cancellation async def tokenize(request: TokenizeRequest, raw_request: Request): handler = tokenization(raw_request) @@ -350,7 +359,7 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): assert_never(generator) -@router.post("/detokenize") +@router.post("/detokenize", dependencies=[Depends(validate_json_request)]) @with_cancellation async def detokenize(request: DetokenizeRequest, raw_request: Request): handler = tokenization(raw_request) @@ -379,7 +388,8 @@ async def show_version(): return JSONResponse(content=ver) -@router.post("/v1/chat/completions") +@router.post("/v1/chat/completions", + dependencies=[Depends(validate_json_request)]) @with_cancellation async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): @@ -400,7 +410,7 @@ async def create_chat_completion(request: ChatCompletionRequest, return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/completions") +@router.post("/v1/completions", dependencies=[Depends(validate_json_request)]) @with_cancellation async def create_completion(request: CompletionRequest, raw_request: Request): handler = completion(raw_request) @@ -418,7 +428,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/embeddings") +@router.post("/v1/embeddings", dependencies=[Depends(validate_json_request)]) @with_cancellation async def create_embedding(request: EmbeddingRequest, raw_request: Request): handler = embedding(raw_request) @@ -464,7 +474,7 @@ async def create_embedding(request: EmbeddingRequest, 
raw_request: Request): assert_never(generator) -@router.post("/pooling") +@router.post("/pooling", dependencies=[Depends(validate_json_request)]) @with_cancellation async def create_pooling(request: PoolingRequest, raw_request: Request): handler = pooling(raw_request) @@ -482,7 +492,7 @@ async def create_pooling(request: PoolingRequest, raw_request: Request): assert_never(generator) -@router.post("/score") +@router.post("/score", dependencies=[Depends(validate_json_request)]) @with_cancellation async def create_score(request: ScoreRequest, raw_request: Request): handler = score(raw_request) @@ -500,7 +510,7 @@ async def create_score(request: ScoreRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/score") +@router.post("/v1/score", dependencies=[Depends(validate_json_request)]) @with_cancellation async def create_score_v1(request: ScoreRequest, raw_request: Request): logger.warning( @@ -510,7 +520,7 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) -@router.post("/rerank") +@router.post("/rerank", dependencies=[Depends(validate_json_request)]) @with_cancellation async def do_rerank(request: RerankRequest, raw_request: Request): handler = rerank(raw_request) @@ -527,7 +537,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/rerank") +@router.post("/v1/rerank", dependencies=[Depends(validate_json_request)]) @with_cancellation async def do_rerank_v1(request: RerankRequest, raw_request: Request): logger.warning_once( @@ -538,7 +548,7 @@ async def do_rerank_v1(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) -@router.post("/v2/rerank") +@router.post("/v2/rerank", dependencies=[Depends(validate_json_request)]) @with_cancellation async def do_rerank_v2(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) @@ -582,7 +592,7 @@ async def reset_prefix_cache(raw_request: Request): return Response(status_code=200) -@router.post("/invocations") +@router.post("/invocations", dependencies=[Depends(validate_json_request)]) async def invocations(raw_request: Request): """ For SageMaker, routes requests to other handlers based on model `task`. @@ -632,7 +642,8 @@ async def stop_profile(raw_request: Request): "Lora dynamic loading & unloading is enabled in the API server. 
" "This should ONLY be used for local development!") - @router.post("/v1/load_lora_adapter") + @router.post("/v1/load_lora_adapter", + dependencies=[Depends(validate_json_request)]) async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): handler = models(raw_request) @@ -643,7 +654,8 @@ async def load_lora_adapter(request: LoadLoraAdapterRequest, return Response(status_code=200, content=response) - @router.post("/v1/unload_lora_adapter") + @router.post("/v1/unload_lora_adapter", + dependencies=[Depends(validate_json_request)]) async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): handler = models(raw_request) From 65b9d2f8fc477503ab4dd778e4b94bc3e3a67ef7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 13 Feb 2025 16:23:45 +0100 Subject: [PATCH 0155/1240] [Frontend] Add `/v1/audio/transcriptions` OpenAI API endpoint (#12909) Signed-off-by: Louis Ulmer --- .buildkite/test-pipeline.yaml | 12 +- .../serving/openai_compatible_server.md | 13 + .../openai_transcription_client.py | 23 ++ requirements-common.txt | 7 +- requirements-test.in | 1 + requirements-test.txt | 5 + .../openai/correctness/__init__.py | 0 .../test_lmeval.py} | 2 +- .../test_transcription_api_correctness.py | 166 ++++++++++ .../openai/test_transcription_validation.py | 122 +++++++ tests/test_config.py | 1 + vllm/assets/audio.py | 5 + vllm/config.py | 11 +- vllm/entrypoints/openai/api_server.py | 43 ++- vllm/entrypoints/openai/protocol.py | 163 +++++++++- vllm/entrypoints/openai/serving_engine.py | 6 +- .../openai/serving_transcription.py | 305 ++++++++++++++++++ vllm/model_executor/models/interfaces.py | 27 ++ vllm/model_executor/models/registry.py | 12 +- vllm/model_executor/models/whisper.py | 5 +- 20 files changed, 910 insertions(+), 19 deletions(-) create mode 100644 examples/online_serving/openai_transcription_client.py create mode 100644 tests/entrypoints/openai/correctness/__init__.py rename tests/entrypoints/openai/{test_accuracy.py => correctness/test_lmeval.py} (98%) create mode 100644 tests/entrypoints/openai/correctness/test_transcription_api_correctness.py create mode 100644 tests/entrypoints/openai/test_transcription_validation.py create mode 100644 vllm/entrypoints/openai/serving_transcription.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e26b1bf3818..9991060a316 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -117,7 +117,7 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/ - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -205,7 +205,7 @@ steps: - VLLM_USE_V1=1 pytest -v -s v1/e2e # Integration test for streaming correctness (requires special branch). 
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/test_accuracy.py::test_lm_eval_accuracy_v1_engine + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" @@ -339,6 +339,14 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c configs/models-small.txt -t 1 +- label: OpenAI API correctness + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ + - label: Encoder Decoder tests # 5min source_file_dependencies: - vllm/ diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 82ef54c16da..64439475fdb 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -41,6 +41,8 @@ We currently support the following OpenAI APIs: - *Note: `parallel_tool_calls` and `user` parameters are ignored.* - [Embeddings API](#embeddings-api) (`/v1/embeddings`) - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`). +- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`) + - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`). In addition, we have the following custom APIs: @@ -296,6 +298,17 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s :end-before: end-chat-embedding-extra-params ::: +(transcriptions-api)= + +### Transcriptions API + +Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription); +you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. + + + +Code example: + (tokenizer-api)= ### Tokenizer API diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py new file mode 100644 index 00000000000..bd3c02a8a95 --- /dev/null +++ b/examples/online_serving/openai_transcription_client.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 +from openai import OpenAI + +from vllm.assets.audio import AudioAsset + +mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path() +winning_call = AudioAsset('winning_call').get_local_path() + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) +with open(str(mary_had_lamb), "rb") as f: + transcription = client.audio.transcriptions.create( + file=f, + model="openai/whisper-large-v3", + language="en", + response_format="text", + temperature=0.0) + print("transcription result:", transcription) diff --git a/requirements-common.txt b/requirements-common.txt index cfa02025629..0b7253cc121 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -8,12 +8,11 @@ py-cpuinfo transformers >= 4.48.2 # Required for Bamba model and Transformers backend. tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. 
-fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' -fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' +fastapi[standard] >= 0.107.0, < 0.113.0; python_version < '3.9' +fastapi[standard] >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' aiohttp openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support) -uvicorn[standard] -pydantic >= 2.9 # Required for fastapi >= 0.113.0 +pydantic >= 2.9 prometheus_client >= 0.18.0 pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 diff --git a/requirements-test.in b/requirements-test.in index 229d743ec80..ecf874ecc50 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -19,6 +19,7 @@ pqdm ray[adag]==2.40.0 sentence-transformers # required for embedding tests soundfile # required for audio tests +jiwer # required for audio tests timm # required for internvl test torch==2.5.1 torchaudio==2.5.1 diff --git a/requirements-test.txt b/requirements-test.txt index e032aac710d..648a2626c85 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -66,6 +66,7 @@ charset-normalizer==3.4.0 click==8.1.7 # via # black + # jiwer # nltk # ray colorama==0.4.6 @@ -187,6 +188,8 @@ jinja2==3.1.4 # via # datamodel-code-generator # torch +jiwer==3.0.5 + # via -r requirements-test.in jmespath==1.0.1 # via # boto3 @@ -470,6 +473,8 @@ pyyaml==6.0.2 # timm # transformers # vocos +rapidfuzz==3.12.1 + # via jiwer ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 diff --git a/tests/entrypoints/openai/correctness/__init__.py b/tests/entrypoints/openai/correctness/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/correctness/test_lmeval.py similarity index 98% rename from tests/entrypoints/openai/test_accuracy.py rename to tests/entrypoints/openai/correctness/test_lmeval.py index df25780cd0f..ebb2ea4d9d1 100644 --- a/tests/entrypoints/openai/test_accuracy.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -13,7 +13,7 @@ from vllm.platforms import current_platform -from ...utils import RemoteOpenAIServer +from ....utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" NUM_CONCURRENT = 500 diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py new file mode 100644 index 00000000000..19d4735b9dd --- /dev/null +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -0,0 +1,166 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Evaluate Transcription API correctness by computing Word Error Rate (WER) +on a given ASR dataset. When provided, it will also compare the WER against +a baseline. +This simulates real work usage of the API and makes sure that the frontend and +AsyncLLMEngine are working correctly. 
+""" +import asyncio +import io +import time +from statistics import mean, median +from typing import List + +import librosa +import pytest +import soundfile +import torch +from datasets import load_dataset +from evaluate import load +from transformers import AutoTokenizer + +from ....utils import RemoteOpenAIServer + + +def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + +async def transcribe_audio(client, tokenizer, y, sr): + # Send loaded audio directly instead of loading from disk, + # dont account for that time though + with to_bytes(y, sr) as f: + start_time = time.perf_counter() + transcription = await client.audio.transcriptions.create( + file=f, + model=tokenizer.name_or_path, + language="en", + temperature=0.0, + ) + end_time = time.perf_counter() + # NOTE there's no streaming in transcriptions, can't measure ttft + latency = end_time - start_time + num_output_tokens = len( + tokenizer(transcription.text, add_special_tokens=False).input_ids) + return latency, num_output_tokens, transcription.text + + +async def bound_transcribe(model_name, sem, client, audio, reference): + tokenizer = AutoTokenizer.from_pretrained(model_name) + # Use semaphore to limit concurrent requests. + async with sem: + result = await transcribe_audio(client, tokenizer, *audio) + # Normalize *english* output/reference for evaluation. + out = tokenizer.normalize(result[2]) + ref = tokenizer.normalize(reference) + return result[:2] + (out, ref) + + +async def process_dataset(model, client, data, concurrent_request): + sem = asyncio.Semaphore(concurrent_request) + + # Warmup call as the first `librosa.load` server-side is quite slow. + audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"] + _ = await bound_transcribe(model, sem, client, (audio, sr), "") + + tasks: List[asyncio.Task] = [] + for sample in data: + audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"] + task = asyncio.create_task( + bound_transcribe(model, sem, client, (audio, sr), sample["text"])) + tasks.append(task) + return await asyncio.gather(*tasks) + + +def print_performance_metrics(results, total_time): + latencies = [res[0] for res in results] + total_tokens = sum([res[1] for res in results]) + + total = len(results) + print(f"Total Requests: {total}") + print(f"Successful Requests: {len(latencies)}") + print(f"Average Latency: {mean(latencies):.4f} seconds") + print(f"Median Latency: {median(latencies):.4f} seconds") + perc = sorted(latencies)[int(len(latencies) * 0.95) - 1] + print(f"95th Percentile Latency: {perc:.4f} seconds") + # Throughput + req_throughput = len(latencies) / total_time + print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s") + throughput = total_tokens / total_time + print(f"Estimated Throughput: {throughput:.2f} tok/s") + + +def add_duration(sample): + y, sr = sample['audio']["array"], sample['audio']["sampling_rate"] + sample['duration_ms'] = librosa.get_duration(y=y, sr=sr) * 1000 + return sample + + +def load_hf_dataset(dataset_repo: str, split='validation', **hf_kwargs): + ## Load and filter the dataset + dataset = load_dataset(dataset_repo, split=split, **hf_kwargs) + if 'duration_ms' not in dataset[0]: + # compute duration to filter + dataset = dataset.map(add_duration) + + # Whisper max supported duration + dataset = dataset.filter(lambda example: example['duration_ms'] < 30000) + return dataset + + +def run_evaluation(model: str, + client, + dataset, + max_concurrent_reqs: int, + 
n_examples: int = -1, + print_metrics: bool = True): + if n_examples > 0: + dataset = dataset.select(range(n_examples)) + start = time.perf_counter() + results = asyncio.run( + process_dataset(model, client, dataset, max_concurrent_reqs)) + end = time.perf_counter() + total_time = end - start + print(f"Total Test Time: {total_time:.4f} seconds") + if print_metrics: + print_performance_metrics(results, total_time) + # Compute WER + predictions = [res[2] for res in results] + references = [res[3] for res in results] + wer = load("wer") + wer_score = 100 * wer.compute(references=references, + predictions=predictions) + print("WER:", wer_score) + return wer_score + + +# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo".. +@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"]) +# Original dataset is 20GB+ in size, hence we use a pre-filtered slice. +@pytest.mark.parametrize( + "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"]) +# NOTE: Expected WER measured with equivalent hf.transformers args: +# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered. +@pytest.mark.parametrize("expected_wer", [12.744980]) +def test_wer_correctness(model_name, + dataset_repo, + expected_wer, + n_examples=-1, + max_concurrent_request=None): + with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server: + dataset = load_hf_dataset(dataset_repo) + + if not max_concurrent_request: + # No max concurrency + max_concurrent_request = n_examples if n_examples > 0\ + else len(dataset) + + client = remote_server.get_async_client() + wer = run_evaluation(model_name, client, dataset, + max_concurrent_request, n_examples) + if expected_wer: + torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2) diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py new file mode 100644 index 00000000000..5d4a5de4bad --- /dev/null +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -0,0 +1,122 @@ +# SPDX-License-Identifier: Apache-2.0 + +# imports for guided decoding tests +import io +import json + +import librosa +import numpy as np +import openai +import pytest +import soundfile as sf + +from vllm.assets.audio import AudioAsset + +from ...utils import RemoteOpenAIServer + + +@pytest.fixture +def mary_had_lamb(): + path = AudioAsset('mary_had_lamb').get_local_path() + with open(str(path), "rb") as f: + yield f + + +@pytest.fixture +def winning_call(): + path = AudioAsset('winning_call').get_local_path() + with open(str(path), "rb") as f: + yield f + + +@pytest.mark.asyncio +async def test_basic_audio(mary_had_lamb): + model_name = "openai/whisper-large-v3-turbo" + server_args = ["--enforce-eager"] + # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb. 
+ prompt = "THE FIRST WORDS I SPOKE" + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + transcription = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + response_format="text", + temperature=0.0) + out = json.loads(transcription)['text'] + assert "Mary had a little lamb," in out + # This should "force" whisper to continue prompt in all caps + transcription_wprompt = await client.audio.transcriptions.create( + model=model_name, + file=mary_had_lamb, + language="en", + response_format="text", + prompt=prompt, + temperature=0.0) + out_capital = json.loads(transcription_wprompt)['text'] + assert prompt not in out_capital + + +@pytest.mark.asyncio +async def test_bad_requests(mary_had_lamb): + model_name = "openai/whisper-small" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + # invalid language + with pytest.raises(openai.BadRequestError): + await client.audio.transcriptions.create(model=model_name, + file=mary_had_lamb, + language="hh", + temperature=0.0) + + # Expect audio too long: repeat the timeseries + mary_had_lamb.seek(0) + audio, sr = librosa.load(mary_had_lamb) + repeated_audio = np.tile(audio, 10) + # Repeated audio to buffer + buffer = io.BytesIO() + sf.write(buffer, repeated_audio, sr, format='WAV') + buffer.seek(0) + with pytest.raises(openai.BadRequestError): + await client.audio.transcriptions.create(model=model_name, + file=buffer, + language="en", + temperature=0.0) + + +@pytest.mark.asyncio +async def test_non_asr_model(winning_call): + # text to text model + model_name = "JackFram/llama-68m" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + res = await client.audio.transcriptions.create(model=model_name, + file=winning_call, + language="en", + temperature=0.0) + assert res.code == 400 and not res.text + assert res.message == "The model does not support Transcriptions API" + + +@pytest.mark.asyncio +async def test_completion_endpoints(): + # text to text model + model_name = "openai/whisper-small" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + res = await client.chat.completions.create( + model=model_name, + messages=[{ + "role": "system", + "content": "You are a helpful assistant." 
+ }]) + assert res.code == 400 + assert res.message == "The model does not support Chat Completions API" + + res = await client.completions.create(model=model_name, prompt="Hello") + assert res.code == 400 + assert res.message == "The model does not support Completions API" diff --git a/tests/test_config.py b/tests/test_config.py index 2dfae218b47..3fb83b4c032 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -17,6 +17,7 @@ ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"), ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"), ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"), + ("openai/whisper-small", "transcription", "transcription"), ], ) def test_auto_task(model_id, expected_runner_type, expected_task): diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index d9e51082e6c..0203dc092a7 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass +from pathlib import Path from typing import Literal from urllib.parse import urljoin @@ -28,6 +29,10 @@ def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: s3_prefix=ASSET_DIR) return librosa.load(audio_path, sr=None) + def get_local_path(self) -> Path: + return get_vllm_public_assets(filename=f"{self.name}.ogg", + s3_prefix=ASSET_DIR) + @property def url(self) -> str: return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") diff --git a/vllm/config.py b/vllm/config.py index 1740871e7c1..10004b8f629 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -54,17 +54,18 @@ _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", - "score", "reward"] + "score", "reward", "transcription"] _ResolvedTask = Literal["generate", "embed", "classify", "score", "reward", - "draft"] + "draft", "transcription"] -RunnerType = Literal["generate", "pooling", "draft"] +RunnerType = Literal["generate", "pooling", "draft", "transcription"] _RUNNER_TASKS: Dict[RunnerType, List[_ResolvedTask]] = { "generate": ["generate"], "pooling": ["embed", "classify", "score", "reward"], "draft": ["draft"], + "transcription": ["transcription"], } _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { @@ -484,6 +485,8 @@ def _get_preferred_task( return "embed" if ModelRegistry.is_cross_encoder_model(architectures): return "score" + if ModelRegistry.is_transcription_model(architectures): + return "transcription" suffix_to_preferred_task: List[Tuple[str, _ResolvedTask]] = [ # Other models follow this pattern @@ -516,6 +519,8 @@ def _resolve_task( runner_support: Dict[RunnerType, bool] = { # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them + "transcription": + ModelRegistry.is_transcription_model(architectures), "generate": ModelRegistry.is_text_generation_model(architectures), "pooling": ModelRegistry.is_pooling_model(architectures), } diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b50a72f3a6c..ad391d6737b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -16,10 +16,10 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Dict, Optional, Set, Tuple, Union +from typing import Annotated, AsyncIterator, Dict, Optional, Set, Tuple, Union import uvloop -from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request +from fastapi import APIRouter, Depends, 
FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -61,6 +61,8 @@ ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, + TranscriptionRequest, + TranscriptionResponse, UnloadLoraAdapterRequest) from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager # yapf: enable @@ -75,6 +77,8 @@ from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) +from vllm.entrypoints.openai.serving_transcription import ( + OpenAIServingTranscription) from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.entrypoints.utils import with_cancellation from vllm.logger import init_logger @@ -327,6 +331,10 @@ def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization +def transcription(request: Request) -> OpenAIServingTranscription: + return request.app.state.openai_serving_transcription + + def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client @@ -520,6 +528,31 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +@router.post("/v1/audio/transcriptions") +@with_cancellation +async def create_transcriptions(request: Annotated[TranscriptionRequest, + Form()], + raw_request: Request): + + handler = transcription(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Transcriptions API") + + audio_data = await request.file.read() + generator = await handler.create_transcription(audio_data, request, + raw_request) + + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + + elif isinstance(generator, TranscriptionResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse(content=generator, media_type="text/event-stream") + + @router.post("/rerank", dependencies=[Depends(validate_json_request)]) @with_cancellation async def do_rerank(request: RerankRequest, raw_request: Request): @@ -832,6 +865,12 @@ async def init_app_state( chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) + state.openai_serving_transcription = OpenAIServingTranscription( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger, + ) if model_config.runner_type == "transcription" else None state.task = model_config.task diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 83b84182623..2bcfdc23577 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -8,9 +8,10 @@ from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union import torch +from fastapi import UploadFile from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, ValidationInfo, field_validator, model_validator) -from typing_extensions import Annotated +from typing_extensions import Annotated, TypeAlias from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.logger import init_logger @@ -1426,3 +1427,163 @@ class LoadLoraAdapterRequest(BaseModel): class UnloadLoraAdapterRequest(BaseModel): lora_name: str lora_int_id: Optional[int] = 
Field(default=None) + + +## Protocols for Audio +AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", + "vtt"] + + +class TranscriptionRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + #https://platform.openai.com/docs/api-reference/audio/createTranscription + + file: UploadFile + """ + The audio file object (not file name) to transcribe, in one of these + formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. + """ + + model: str + """ID of the model to use. + """ + + language: Optional[str] = None + """The language of the input audio. + + Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format + will improve accuracy and latency. + """ + + prompt: str = Field(default="") + """An optional text to guide the model's style or continue a previous audio + segment. + + The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. + """ + + response_format: AudioResponseFormat = Field(default="json") + """ + The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, or `vtt`. + """ + + ## TODO (varun) : Support if set to 0, certain thresholds are met !! + temperature: float = Field(default=0.0) + """The sampling temperature, between 0 and 1. + + Higher values like 0.8 will make the output more random, while lower values + like 0.2 will make it more focused / deterministic. If set to 0, the model + will use [log probability](https://en.wikipedia.org/wiki/Log_probability) + to automatically increase the temperature until certain thresholds are hit. + """ + + timestamp_granularities: List[Literal["word", "segment"]] = Field( + alias="timestamp_granularities[]", default=[]) + """The timestamp granularities to populate for this transcription. + + `response_format` must be set `verbose_json` to use timestamp granularities. + Either or both of these options are supported: `word`, or `segment`. Note: + There is no additional latency for segment timestamps, but generating word + timestamps incurs additional latency. + """ + + # Default sampling parameters for transcription requests. + _DEFAULT_SAMPLING_PARAMS: dict = { + "temperature": 0, + } + + def to_sampling_params( + self, + default_max_tokens: int, + default_sampling_params: Optional[dict] = None) -> SamplingParams: + # TODO(#9845): remove max_tokens when field is removed from OpenAI API + max_tokens = default_max_tokens + + if default_sampling_params is None: + default_sampling_params = {} + # Default parameters + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]) + + return SamplingParams.from_optional(temperature=temperature, + max_tokens=max_tokens) + + +# Transcription response objects +class TranscriptionResponse(OpenAIBaseModel): + text: str + """The transcribed text.""" + + +class TranscriptionWord(OpenAIBaseModel): + end: float + """End time of the word in seconds.""" + + start: float + """Start time of the word in seconds.""" + + word: str + """The text content of the word.""" + + +class TranscriptionSegment(OpenAIBaseModel): + id: int + """Unique identifier of the segment.""" + + avg_logprob: float + """Average logprob of the segment. + + If the value is lower than -1, consider the logprobs failed. + """ + + compression_ratio: float + """Compression ratio of the segment. + + If the value is greater than 2.4, consider the compression failed. 
+ """ + + end: float + """End time of the segment in seconds.""" + + no_speech_prob: float + """Probability of no speech in the segment. + + If the value is higher than 1.0 and the `avg_logprob` is below -1, consider + this segment silent. + """ + + seek: int + """Seek offset of the segment.""" + + start: float + """Start time of the segment in seconds.""" + + temperature: float + """Temperature parameter used for generating the segment.""" + + text: str + """Text content of the segment.""" + + tokens: List[int] + """Array of token IDs for the text content.""" + + +class TranscriptionResponseVerbose(OpenAIBaseModel): + duration: str + """The duration of the input audio.""" + + language: str + """The language of the input audio.""" + + text: str + """The transcribed text.""" + + segments: Optional[List[TranscriptionSegment]] = None + """Segments of the transcribed text and their corresponding details.""" + + words: Optional[List[TranscriptionWord]] = None + """Extracted words and their corresponding timestamps.""" diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 9efb5e6fa39..785117ca1d4 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -31,7 +31,8 @@ ErrorResponse, RerankRequest, ScoreRequest, TokenizeChatRequest, - TokenizeCompletionRequest) + TokenizeCompletionRequest, + TranscriptionRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable @@ -57,7 +58,8 @@ ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, TokenizeChatRequest] -AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest] +AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest, + TranscriptionRequest] class TextTokensPrompt(TypedDict): diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py new file mode 100644 index 00000000000..da4930e0e2d --- /dev/null +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -0,0 +1,305 @@ +# SPDX-License-Identifier: Apache-2.0 +import asyncio +import io +from typing import AsyncGenerator, Optional, Union, cast + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ErrorResponse, + RequestResponseMetadata, + TranscriptionRequest, + TranscriptionResponse, + TranscriptionResponseVerbose) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import PromptType +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.utils import PlaceholderModule + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") # type: ignore[assignment] + +logger = init_logger(__name__) + +# From https://platform.openai.com/docs/guides/speech-to-text/supported-languages#supported-languages +# TODO these configs should live somewhere with the model so we can support +# additional ones + +ISO639_1_SUPPORTED_LANGS = { + "af": "Afrikaans", + "ar": "Arabic", + "hy": "Armenian", + "az": "Azerbaijani", + "be": "Belarusian", + "bs": "Bosnian", + "bg": "Bulgarian", + "ca": "Catalan", + "zh": "Chinese", + "hr": "Croatian", + "cs": "Czech", + "da": "Danish", + "nl": "Dutch", + "en": "English", + "et": "Estonian", + "fi": 
"Finnish", + "fr": "French", + "gl": "Galician", + "de": "German", + "el": "Greek", + "he": "Hebrew", + "hi": "Hindi", + "hu": "Hungarian", + "is": "Icelandic", + "id": "Indonesian", + "it": "Italian", + "ja": "Japanese", + "kn": "Kannada", + "kk": "Kazakh", + "ko": "Korean", + "lv": "Latvian", + "lt": "Lithuanian", + "mk": "Macedonian", + "ms": "Malay", + "mr": "Marathi", + "mi": "Maori", + "ne": "Nepali", + "no": "Norwegian", + "fa": "Persian", + "pl": "Polish", + "pt": "Portuguese", + "ro": "Romanian", + "ru": "Russian", + "sr": "Serbian", + "sk": "Slovak", + "sl": "Slovenian", + "es": "Spanish", + "sw": "Swahili", + "sv": "Swedish", + "tl": "Tagalog", + "ta": "Tamil", + "th": "Thai", + "tr": "Turkish", + "uk": "Ukrainian", + "ur": "Urdu", + "vi": "Vietnamese", + "cy": "Welsh" +} +ISO639_1_OTHER_LANGS = { + "lo": "Lao", + "jw": "Javanese", + "tk": "Turkmen", + "yi": "Yiddish", + "so": "Somali", + "bn": "Bengali", + "nn": "Norwegian Nynorsk", + "si": "Sinhala", + "yo": "Yoruba", + "sa": "Sanskrit", + "mi": "Māori", + "fo": "Faroese", # codespell:ignore + "mt": "Maltese", + "tg": "Tajik", + "mg": "Malagasy", + "haw": "Hawaiian", + "km": "Khmer", + "br": "Breton", + "ps": "Pashto", + "ln": "Lingala", + "la": "Latin", + "ml": "Malayalam", + "sq": "Albanian", + "su": "Sundanese", + "eu": "Basque", + "ka": "Georgian", + "uz": "Uzbek", + "sn": "Shona", + "ht": "Haitian", + "as": "Assamese", + "mn": "Mongolian", + "te": "Telugu", + "pa": "Panjabi", + "tt": "Tatar", + "gu": "Gujarati", + "oc": "Occitan", + "ha": "Hausa", + "ba": "Bashkir", + "my": "Burmese", + "sd": "Sindhi", + "am": "Amharic", + "lb": "Luxembourgish", + "bo": "Tibetan" +} + +# As per https://platform.openai.com/docs/guides/speech-to-text#overview. +# TODO configurable +MAX_AUDIO_CLIP_FILESIZE_MB = 25 +# TODO get from processor.feature_extractor.chunk_length +MAX_AUDIO_CLIP_DURATION_S = 30 + + +class OpenAIServingTranscription(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + return_tokens_as_token_ids: bool = False, + ): + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger, + return_tokens_as_token_ids=return_tokens_as_token_ids) + + diff_sampling_param = self.model_config.get_diff_sampling_param() + if diff_sampling_param: + logger.info( + "Overwriting default completion sampling param with: %s", + diff_sampling_param) + + async def _preprocess_transcription( + self, + request: TranscriptionRequest, + audio_data: bytes, + ) -> PromptType: + # Validate request + # TODO language should be optional and can be guessed. + # For now we default to en. See + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/generation_whisper.py#L1520 + lang_token = f"<|{request.language}|>" if request.language else "<|en|>" + if request.language: + if request.language in ISO639_1_SUPPORTED_LANGS: + pass + elif request.language in ISO639_1_OTHER_LANGS: + logger.warning( + "The selected language %s has limited accuracy with" + " reported WER>=0.5. Results may be less accurate " + "for this choice.", request.language) + else: + raise ValueError( + f"Unsupported language: {request.language}." 
+ "Language should be one of:" + + f" {list(ISO639_1_SUPPORTED_LANGS.values())}" + + f"or {list(ISO639_1_OTHER_LANGS.values())}") + + if len(audio_data) / 1024**2 > MAX_AUDIO_CLIP_FILESIZE_MB: + raise ValueError("Maximum file size exceeded.") + + with io.BytesIO(audio_data) as bytes_: + y, sr = librosa.load(bytes_) + if librosa.get_duration(y=y, sr=sr) > MAX_AUDIO_CLIP_DURATION_S: + raise ValueError( + f"Maximum clip duration ({MAX_AUDIO_CLIP_DURATION_S}s) " + "exceeded.") + + prompt = { + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": (y, sr), + }, + }, + "decoder_prompt": + f"<|startoftranscript|>{lang_token}<|transcribe|><|notimestamps|>{request.prompt}" + } + return cast(PromptType, prompt) + + # TODO (varun) : Make verbose response work ! + async def create_transcription( + self, audio_data: bytes, request: TranscriptionRequest, + raw_request: Request + ) -> Union[TranscriptionResponse, TranscriptionResponseVerbose, + ErrorResponse]: + """Transcription API similar to OpenAI's API. + + See https://platform.openai.com/docs/api-reference/audio/createTranscription + for the API specification. This API mimics the OpenAI transcription API. + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + # If the engine is dead, raise the engine's DEAD_ERROR. + # This is required for the streaming case, where we return a + # success status before we actually start generating text :). + if self.engine_client.errored: + raise self.engine_client.dead_error + + if request.response_format not in ['text', 'json']: + return self.create_error_response( + "Currently only support response_format `text` or `json`") + + # TODO cmpl->transcription? + request_id = f"cmpl-{self._base_request_id(raw_request)}" + + request_metadata = RequestResponseMetadata(request_id=request_id) + if raw_request: + raw_request.state.request_metadata = request_metadata + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + if lora_request: + return self.create_error_response( + "Currently do not support LoRA for Transcription.") + if prompt_adapter_request: + return self.create_error_response( + "Currently do not support PromptAdapter for Transcription." + ) + + prompt = await self._preprocess_transcription( + request=request, + audio_data=audio_data, + ) + + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None + try: + # TODO(rob): subtract len of tokenized prompt. + default_max_tokens = self.model_config.max_model_len + default_params = self.model_config.get_diff_sampling_param() + sampling_params = request.to_sampling_params( + default_max_tokens, default_params) + + self._log_inputs( + request_id, + prompt['decoder_prompt'], # type: ignore + params=sampling_params, + lora_request=None, + prompt_adapter_request=None) + + result_generator = self.engine_client.generate( + prompt, + sampling_params, + request_id, + ) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + # TODO(rob): figure out a way to pipe streaming in. + # Non-streaming response. 
+ try: + async for op in result_generator: + result = op + return TranscriptionResponse(text=result.outputs[0].text) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 0fc5c4db179..a0a1b69ad50 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -441,3 +441,30 @@ def supports_cross_encoding( model: Union[Type[object], object], ) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: return is_pooling_model(model) and _supports_cross_encoding(model) + + +@runtime_checkable +class SupportsTranscription(Protocol): + """The interface required for all models that support transcription.""" + + supports_transcription: ClassVar[Literal[True]] = True + + +@overload +def supports_transcription( + model: Type[object]) -> TypeIs[Type[SupportsTranscription]]: + ... + + +@overload +def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: + ... + + +def supports_transcription( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsTranscription]], TypeIs[SupportsTranscription]]: + if isinstance(model, type): + return isinstance(model, SupportsTranscription) + + return isinstance(model, SupportsTranscription) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 08c4642b4a9..7260d973bfb 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -22,7 +22,7 @@ from .interfaces import (has_inner_state, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, - supports_pp) + supports_pp, supports_transcription) from .interfaces_base import is_text_generation_model logger = init_logger(__name__) @@ -224,6 +224,7 @@ class _ModelInfo: has_inner_state: bool is_attention_free: bool is_hybrid: bool + supports_transcription: bool @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": @@ -237,7 +238,7 @@ def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": has_inner_state=has_inner_state(model), is_attention_free=is_attention_free(model), is_hybrid=is_hybrid(model), - ) + supports_transcription=supports_transcription(model)) class _BaseRegisteredModel(ABC): @@ -485,6 +486,13 @@ def is_hybrid_model( model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_hybrid + def is_transcription_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_transcription + ModelRegistry = _ModelRegistry({ model_arch: diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 0a3011d3610..0b506072094 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -31,7 +31,7 @@ from vllm.sequence import SequenceData from vllm.transformers_utils.processor import cached_get_processor -from .interfaces import SupportsMultiModal +from .interfaces import SupportsMultiModal, SupportsTranscription from .utils import AutoWeightsLoader, WeightsMapper, make_layers logger = init_logger(__name__) @@ -637,7 +637,8 @@ def input_mapper_for_whisper( @MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_whisper) @MULTIMODAL_REGISTRY.register_max_multimodal_tokens( 
"audio", get_max_whisper_audio_tokens) -class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): +class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, + SupportsMultiModal): packed_modules_mapping = { "self_attn.qkv_proj": [ "self_attn.q_proj", From 355ea7babe9d9f0fb2482cabba43e1b296e20076 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 13 Feb 2025 20:51:30 +0000 Subject: [PATCH 0156/1240] Add label if pre-commit passes (#12527) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Louis Ulmer --- .github/workflows/add_label_precommit.yml | 38 +++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/add_label_precommit.yml diff --git a/.github/workflows/add_label_precommit.yml b/.github/workflows/add_label_precommit.yml new file mode 100644 index 00000000000..a88b44f03a5 --- /dev/null +++ b/.github/workflows/add_label_precommit.yml @@ -0,0 +1,38 @@ +name: Add label on pre-commit success +on: + workflow_run: + workflows: [pre-commit] + types: [requested, completed] +jobs: + add-label-on-pre-commit-success: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'success' }} + steps: + - name: Add label + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: ['pre-commit-passed'] + }) + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + remove-label-on-pre-commit-not-success: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion != 'success' }} + steps: + - name: Remove label + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + with: + script: | + github.rest.issues.removeLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: ['pre-commit passed'] + }) + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 8a61bcc9b2c5509d904b6c01c22a8d29cb22f433 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 13 Feb 2025 18:43:37 -0500 Subject: [PATCH 0157/1240] Optimize moe_align_block_size for deepseek_v3 (#12850) Signed-off-by: mgoin Signed-off-by: Louis Ulmer --- csrc/moe/moe_align_sum_kernels.cu | 52 +++++++++++++------ .../layers/fused_moe/fused_moe.py | 3 +- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index c072744f066..d7be769458e 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -198,26 +198,27 @@ __global__ void moe_align_block_size_global_mem_kernel( } // taken from -// https://github.com/sgl-project/sglang/commit/ded9fcd09a43d5e7d5bb31a2bc3e9fc21bf65d2a +// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957 template __global__ void sgl_moe_align_block_size_kernel( scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts, int32_t block_size, size_t numel, int32_t* cumsum) { __shared__ int32_t shared_counts[32][8]; - __shared__ int32_t local_offsets[256]; const int warp_id = threadIdx.x / 32; - const int lane_id = threadIdx.x % 32; const int experts_per_warp = 8; const int my_expert_start = warp_id * experts_per_warp; + // Initialize shared_counts for this warp's experts for (int i = 0; i < 
experts_per_warp; ++i) { if (my_expert_start + i < num_experts) { shared_counts[warp_id][i] = 0; } } + __syncthreads(); + const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); const size_t start_idx = threadIdx.x * tokens_per_thread; @@ -230,6 +231,7 @@ __global__ void sgl_moe_align_block_size_kernel( __syncthreads(); + // Single thread computes cumulative sum and total tokens if (threadIdx.x == 0) { cumsum[0] = 0; for (int i = 1; i <= num_experts; ++i) { @@ -246,19 +248,28 @@ __global__ void sgl_moe_align_block_size_kernel( __syncthreads(); + // Assign expert IDs to blocks if (threadIdx.x < num_experts) { for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; i += block_size) { expert_ids[i / block_size] = threadIdx.x; } - local_offsets[threadIdx.x] = cumsum[threadIdx.x]; } +} - __syncthreads(); - - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { +// taken from +// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957 +template +__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids, + int32_t* sorted_token_ids, + int32_t* cumsum_buffer, + size_t numel) { + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t stride = blockDim.x * gridDim.x; + + for (size_t i = tid; i < numel; i += stride) { int32_t expert_id = topk_ids[i]; - int32_t rank_post_pad = atomicAdd(&local_offsets[expert_id], 1); + int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); sorted_token_ids[rank_post_pad] = i; } } @@ -377,23 +388,34 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + TORCH_CHECK(num_experts == 256, + "sgl_moe_align_block_size kernel only supports deepseek v3."); + VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` - // tensors + // calc needed amount of shared mem for `cumsum` tensors auto options_int = torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); - // torch::Tensor token_cnts_buffer = - // torch::empty({(num_experts + 1) * num_experts}, options_int); torch::Tensor cumsum_buffer = - torch::empty({num_experts + 1}, options_int); + torch::zeros({num_experts + 1}, options_int); - auto kernel = vllm::moe::sgl_moe_align_block_size_kernel; - kernel<<<1, 1024, 0, stream>>>( + auto align_kernel = + vllm::moe::sgl_moe_align_block_size_kernel; + align_kernel<<<1, 1024, 0, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, block_size, topk_ids.numel(), cumsum_buffer.data_ptr()); + + const int block_threads = 256; + const int num_blocks = + (topk_ids.numel() + block_threads - 1) / block_threads; + const int max_blocks = 65535; + const int actual_blocks = std::min(num_blocks, max_blocks); + auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel; + sort_kernel<<>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), + cumsum_buffer.data_ptr(), topk_ids.numel()); }); } diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f14200e0288..d0b6249e1c3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -596,7 +596,7 @@ def moe_align_block_size( dtype=torch.int32, device=topk_ids.device) if 
num_experts >= 224: - if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: + if envs.VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON or num_experts != 256: moe_align_block_size_triton( topk_ids, num_experts, @@ -606,6 +606,7 @@ def moe_align_block_size( num_tokens_post_pad, ) else: + # Currently requires num_experts=256 ops.sgl_moe_align_block_size( topk_ids, num_experts, From 00b7f5ef47fedbd0d1e9e2f17ff6609968359b99 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 13 Feb 2025 19:01:14 -0500 Subject: [PATCH 0158/1240] [Kernel][Bugfix] Refactor and Fix CUTLASS 2:4 Sparse Kernels (#13198) Signed-off-by: Tyler Michael Smith Signed-off-by: Louis Ulmer --- CMakeLists.txt | 10 +- .../epilogue/scaled_mm_epilogues_c3x.hpp | 69 ++++- csrc/ops.h | 3 +- .../cutlass_w8a8/c3x/scaled_mm.cuh | 13 +- .../cutlass_w8a8/scaled_mm_c2x.cuh | 11 +- csrc/sparse/cutlass/sparse_compressor_c3x.cu | 165 ----------- csrc/sparse/cutlass/sparse_compressor_c3x.cuh | 90 ++++++ .../sparse/cutlass/sparse_compressor_entry.cu | 42 --- csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu | 264 +++++++++--------- csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh | 232 +++++++++------ csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 30 ++ csrc/torch_bindings.cpp | 6 +- tests/kernels/test_cutlass_2of4_sparse.py | 81 +++++- vllm/_custom_ops.py | 17 +- .../compressed_tensors/compressed_tensors.py | 7 - .../schemes/compressed_tensors_24.py | 9 +- 16 files changed, 576 insertions(+), 473 deletions(-) delete mode 100644 csrc/sparse/cutlass/sparse_compressor_c3x.cu create mode 100644 csrc/sparse/cutlass/sparse_compressor_c3x.cuh delete mode 100644 csrc/sparse/cutlass/sparse_compressor_entry.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 244ceb721c9..8e8f7adf6ea 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -228,7 +228,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. - set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use") + # Please keep this in sync with FetchContent_Declare line below. + set(CUTLASS_REVISION "v3.7.0" CACHE STRING "CUTLASS revision to use") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) @@ -245,6 +246,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git + # Please keep this in sync with CUTLASS_REVISION line above. GIT_TAG v3.7.0 GIT_PROGRESS TRUE @@ -266,7 +268,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" "csrc/quantization/fp4/nvfp4_quant_entry.cu" "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" - "csrc/sparse/cutlass/sparse_compressor_entry.cu" "csrc/cutlass_extensions/common.cpp") set_gencode_flags_for_srcs( @@ -359,8 +360,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now). 
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS) - set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu" - "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") + set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu") set_gencode_flags_for_srcs( SRCS "${SRCS}" CUDA_ARCHS "${SCALED_MM_3X_ARCHS}") @@ -476,7 +476,7 @@ define_gpu_extension_target( SOURCES ${VLLM_EXT_SRC} COMPILE_FLAGS ${VLLM_GPU_FLAGS} ARCHITECTURES ${VLLM_GPU_ARCHES} - INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR} + INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR} USE_SABI 3 WITH_SOABI) diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index c590c66a666..583fa3c4551 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -16,6 +16,30 @@ namespace vllm::c3x { using namespace cute; +template +struct identity { + CUTLASS_HOST_DEVICE + T operator()(T lhs) const { return lhs; } +}; + +template +struct TrivialEpilogue { + private: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + using Compute = cutlass::epilogue::fusion::Sm90Compute< + cutlass::epilogue::thread::Identity, ElementD, ElementAcc, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + template + static ArgumentType prepare_args(Args... args) { + return {}; + } +}; + /* * This class provides the common load descriptors for the * ScaledEpilogue[...] classes @@ -174,6 +198,49 @@ struct ScaledEpilogueBias } }; +/* + * This epilogue performs the same operation as ScaledEpilogueBias, but the + * bias is a column vector instead of a row vector. Useful e.g. if we are + * computing a GEMM via C^T += B^T A^T. This happens in the 2:4 sparse kernels. + */ +template +struct ScaledEpilogueColumnBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template ColLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args, bias_args}; + } +}; + /* * This epilogue directly supports per-tensor azp in int32 form. 
* As opposed to the per-token epilogue below, this epilogue only has an azp_adj @@ -314,4 +381,4 @@ struct ScaledEpilogueBiasAzpToken } }; -}; // namespace vllm::c3x \ No newline at end of file +}; // namespace vllm::c3x diff --git a/csrc/ops.h b/csrc/ops.h index 70e864cc6a8..46007889672 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -176,8 +176,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b_scales, std::optional const& bias); -bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, - torch::Tensor& e, torch::Tensor const& a); +std::vector cutlass_sparse_compress(torch::Tensor const& a); #endif void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh index 9227ebb7352..d2f43e2b7a8 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm.cuh @@ -53,12 +53,17 @@ struct cutlass_3x_gemm { using EVTCompute = typename Epilogue::EVTCompute; + // These are the minimum alignments needed for the kernels to compile + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentCD = 4; + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, - ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4, - EpilogueSchedule, EVTCompute>::CollectiveOp; + ElementAcc, float, ElementC, StrideC, AlignmentCD, ElementD, StrideD, + AlignmentCD, EpilogueSchedule, EVTCompute>::CollectiveOp; static constexpr size_t CEStorageSize = sizeof(typename CollectiveEpilogue::SharedStorage); @@ -69,8 +74,8 @@ struct cutlass_3x_gemm { using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, - ElementAB, cutlass::layout::RowMajor, 16, - ElementAB, cutlass::layout::ColumnMajor, 16, + ElementAB, cutlass::layout::RowMajor, AlignmentAB, + ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, ElementAcc, TileShape, ClusterShape, Stages, KernelSchedule>::CollectiveOp; diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh index f2fae4b66d6..ce7cf2f3528 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh @@ -103,14 +103,19 @@ struct cutlass_2x_gemm { using EVTD = cutlass::epilogue::threadblock::Sm80EVT; + // These are the minimum alignments needed for the kernels to compile + static constexpr int AlignmentAB = + 128 / cutlass::sizeof_bits::value; + static constexpr int AlignmentCD = 4; + // clang-format off using RowMajor = typename cutlass::layout::RowMajor; using ColumnMajor = typename cutlass::layout::ColumnMajor; using KernelType = ArchGuard - -#if defined CUDA_VERSION && CUDA_VERSION >= 12020 -#include "sparse_scaled_mm_c3x.cuh" - -#include "cutlass/numeric_conversion.h" -#include "cutlass/transform/device/transform_universal_adapter.hpp" -#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" -#include "cutlass/epilogue/collective/default_epilogue.hpp" - -#include "cutlass/util/host_tensor.h" -#include "cutlass/util/packed_stride.hpp" -// clang-format on - -using namespace cute; -using namespace vllm; - -/// Make A structured sparse by replacing elements with 0 and compress it 
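For background on what this compressor produces (the refactor in this patch moves the logic into sparse_compressor_c3x.cuh and has cutlass_sparse_compress return the metadata and compressed tensors directly): with 2:4 structured sparsity, each group of four values along K contains at most two nonzeros, so compression keeps a K/2-wide tensor of the nonzero values plus per-group index metadata (the real kernel packs four 2-bit indices per metadata byte, per the elemsPerMetaElem comment below). The pure-PyTorch toy below only illustrates that idea and is not the CUTLASS layout.

import torch

def toy_sparse_compress(a: torch.Tensor):
    # Keep the two nonzeros of every group of four columns, plus their indices.
    m, k = a.shape
    assert k % 4 == 0
    groups = a.view(m, k // 4, 4)
    idx = groups.abs().topk(2, dim=-1).indices.sort(dim=-1).values
    nzs = torch.gather(groups, -1, idx).reshape(m, k // 2)
    return nzs, idx  # idx plays the role of the (unpacked) 2-bit metadata

a = torch.tensor([[1., 0., 0., 3., 0., 5., 6., 0.]])  # 2:4-sparse toy matrix
nzs, meta = toy_sparse_compress(a)
print(nzs)   # tensor([[1., 3., 5., 6.]])
print(meta)  # tensor([[[0, 3], [1, 2]]])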
-template -bool cutlass_sparse_compress(torch::Tensor& a_nzs, torch::Tensor& a_meta, - torch::Tensor const& a) { - // Checks for conformality - TORCH_CHECK(a.dtype() == torch::kInt8 || a.dtype() == torch::kFloat8_e4m3fn || - a.dtype() == torch::kFloat16 || a.dtype() == torch::kBFloat16); - TORCH_CHECK(a.dim() == 2) - // Check for strides and alignment - TORCH_CHECK(a.stride(0) % 4 == 0) // Required for semi-structured sparsity - TORCH_CHECK(a.stride(1) == 1) - - int m = a.size(0); - int k = a.size(1); - - // Sparse kernel setup; this kernel is not used for matmul, - // but just for setting up the compressor utility - // A matrix configuration - using ElementA = ElementA_; - using LayoutTagA = cutlass::layout::RowMajor; - constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; - // B matrix configuration - using ElementB = ElementA; - using LayoutTagB = cutlass::layout::ColumnMajor; - constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - // C/D matrix configuration - using ElementC = float; - using LayoutTagC = cutlass::layout::ColumnMajor; - constexpr int AlignmentC = 128 / cutlass::sizeof_bits::value; - // Core kernel configurations - using ElementAccumulator = ElementAcc_; - using TileShape = Shape<_128, _128, _128>; - using TileShapeRef = Shape<_128, _128, _64>; - using ClusterShape = Shape<_1, _2, _1>; - using KernelSchedule = typename std::conditional< - std::is_same_v, - cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum, - cutlass::gemm::KernelTmaWarpSpecialized>::type; - - using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized; - using ProblemShape = Shape; - - using CollectiveEpilogue = - typename cutlass::epilogue::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape, - ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto, - ElementAccumulator, ElementAccumulator, ElementC, LayoutTagC, - AlignmentC, ElementC, LayoutTagC, AlignmentC, - EpilogueSchedule>::CollectiveOp; - - using CollectiveMainloop = - typename cutlass::gemm::collective::CollectiveBuilder< - cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, ElementA, - LayoutTagA, AlignmentA, ElementB, LayoutTagB, AlignmentB, - ElementAccumulator, TileShape, ClusterShape, - cutlass::gemm::collective::StageCountAutoCarveout( - sizeof(typename CollectiveEpilogue::SharedStorage))>, - KernelSchedule>::CollectiveOp; - - using GemmKernel = - cutlass::gemm::kernel::GemmUniversal; - - using Gemm = cutlass::gemm::device::GemmUniversalAdapter; - - using StrideA = cutlass::gemm::TagToStrideA_t; - using StrideE = StrideA; - - using StrideA = Stride, int64_t>; - - // The n (=1) dimension does not matter for the compressor - typename GemmKernel::ProblemShape prob_shape{m, 1, k, 1}; - - using LayoutA = typename GemmKernel::CollectiveMainloop::LayoutA; - using LayoutE = typename GemmKernel::CollectiveMainloop::LayoutE; - - using ElementE = typename GemmKernel::CollectiveMainloop::ElementE; - using SparseConfig = typename GemmKernel::CollectiveMainloop::SparseConfig; - - // Offline compressor kernel - using CompressorUtility = - cutlass::transform::kernel::StructuredSparseCompressorUtility< - ProblemShape, ElementA, LayoutTagA, SparseConfig>; - - using CompressorKernel = - cutlass::transform::kernel::StructuredSparseCompressor< - ProblemShape, ElementA, LayoutTagA, SparseConfig, - cutlass::arch::Sm90>; - - using Compressor = - cutlass::transform::device::TransformUniversalAdapter; - - auto [M, N, K, L] = prob_shape; - - StrideA stride_A; - stride_A = - 
cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); - - CompressorUtility compressor_utility(prob_shape, stride_A); - - int ME = compressor_utility.get_metadata_m_physical(); - int KE = compressor_utility.get_metadata_k_physical(); - int KC = compressor_utility.get_tensorA_k_physical(); - - auto a_ptr = static_cast(a.data_ptr()); - - auto a_nzs_ptr = static_cast(a_nzs.data_ptr()); - auto a_meta_ptr = static_cast( - a_meta.data_ptr()); - - cutlass::KernelHardwareInfo hw_info; - hw_info.device_id = 0; - hw_info.sm_count = - cutlass::KernelHardwareInfo::query_device_multiprocessor_count( - hw_info.device_id); - typename Compressor::Arguments arguments{ - prob_shape, {a_ptr, stride_A, a_nzs_ptr, a_meta_ptr}, {hw_info}}; - - Compressor compressor_op; - size_t workspace_size = Compressor::get_workspace_size(arguments); - cutlass::device_memory::allocation workspace(workspace_size); - - CUTLASS_CHECK(compressor_op.can_implement(arguments)); - CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.get())); - CUTLASS_CHECK(compressor_op.run()); - CUDA_CHECK(cudaDeviceSynchronize()); - - return true; -} - -bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta, - torch::Tensor const& a) { - if (a.dtype() == torch::kBFloat16) { - return cutlass_sparse_compress(a_nzs, a_meta, - a); - } else if (a.dtype() == torch::kFloat16) { - return cutlass_sparse_compress(a_nzs, a_meta, a); - } else if (a.dtype() == torch::kFloat8_e4m3fn) { - return cutlass_sparse_compress(a_nzs, a_meta, - a); - } else if (a.dtype() == torch::kInt8) { - return cutlass_sparse_compress(a_nzs, a_meta, a); - } - return false; -} -#endif diff --git a/csrc/sparse/cutlass/sparse_compressor_c3x.cuh b/csrc/sparse/cutlass/sparse_compressor_c3x.cuh new file mode 100644 index 00000000000..2cc235f3a68 --- /dev/null +++ b/csrc/sparse/cutlass/sparse_compressor_c3x.cuh @@ -0,0 +1,90 @@ +#pragma once + +// clang-format will break include orders +// clang-format off +#include + +#if defined CUDA_VERSION && CUDA_VERSION >= 12020 +#include "sparse_scaled_mm_c3x.cuh" + +#include "cutlass/numeric_conversion.h" +#include "cutlass/transform/device/transform_universal_adapter.hpp" +#include "cutlass/transform/kernel/sparse_gemm_compressor.hpp" +#include "cutlass/epilogue/collective/default_epilogue.hpp" + +// clang-format on + +using namespace cute; +using namespace vllm; + +using CompressorResult = std::tuple; +/// Make A structured sparse by replacing elements with 0 and compress it +template +CompressorResult cutlass_sparse_compress(torch::Tensor const& a) { + // Checks for conformality + TORCH_CHECK(a.dtype() == torch::kInt8 || a.dtype() == torch::kFloat8_e4m3fn || + a.dtype() == torch::kFloat16 || a.dtype() == torch::kBFloat16); + TORCH_CHECK(a.dim() == 2) + // Check for strides and alignment + TORCH_CHECK(a.stride(0) % 4 == 0) // Required for semi-structured sparsity + TORCH_CHECK(a.stride(1) == 1) + + using GemmKernel = typename Gemm::KernelType; + using ElementA = typename Gemm::ElementAB; + using ElementE = typename GemmKernel::CollectiveMainloop::ElementE; + + int m = a.size(0); + int k = a.size(1); + using ProblemShape = typename GemmKernel::ProblemShape; + ProblemShape prob_shape{m, 1, k, 1}; + + int64_t lda = a.stride(0); + using StrideA = Stride, int64_t>; + StrideA a_stride{lda, Int<1>{}, 0}; + + using CompressorUtility = typename Gemm::CompressorUtility; + CompressorUtility compressor_utility(prob_shape, a_stride); + + // Allocate buffers for the metadata E and the compressed matrix A + int ME 
= compressor_utility.get_metadata_m_physical(); + int KE = compressor_utility.get_metadata_k_physical(); + int MC = compressor_utility.get_tensorA_m_physical(); + int KC = compressor_utility.get_tensorA_k_physical(); + + auto const a_meta_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto const a_nzs_options = + torch::TensorOptions().dtype(a.dtype()).device(a.device()); + + auto a_meta = torch::zeros({ME, KE}, a_meta_options); + auto a_nzs = torch::zeros({MC, KC}, a_nzs_options); + + auto a_ptr = static_cast(a.data_ptr()); + auto a_nzs_ptr = static_cast(a_nzs.data_ptr()); + auto a_meta_ptr = static_cast(a_meta.data_ptr()); + + cutlass::KernelHardwareInfo hw_info; + hw_info.device_id = a.device().index(); + hw_info.sm_count = + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + hw_info.device_id); + + using Compressor = typename Gemm::Compressor; + typename Compressor::Arguments arguments{ + prob_shape, {a_ptr, a_stride, a_nzs_ptr, a_meta_ptr}, {hw_info}}; + + Compressor compressor_op; + size_t workspace_size = Compressor::get_workspace_size(arguments); + auto const workspace_options = + torch::TensorOptions().dtype(torch::kUInt8).device(a.device()); + auto workspace = torch::empty(workspace_size, workspace_options); + + CUTLASS_CHECK(compressor_op.can_implement(arguments)); + CUTLASS_CHECK(compressor_op.initialize(arguments, workspace.data_ptr())); + CUTLASS_CHECK(compressor_op.run()); + CUDA_CHECK(cudaDeviceSynchronize()); + + return {a_meta, a_nzs}; +} + +#endif diff --git a/csrc/sparse/cutlass/sparse_compressor_entry.cu b/csrc/sparse/cutlass/sparse_compressor_entry.cu deleted file mode 100644 index 3401761c1b7..00000000000 --- a/csrc/sparse/cutlass/sparse_compressor_entry.cu +++ /dev/null @@ -1,42 +0,0 @@ -#include - -#include -#include - -#include "cutlass_extensions/common.hpp" - -#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X -bool cutlass_sparse_compress_sm90(torch::Tensor& a_nzs, torch::Tensor& a_meta, - torch::Tensor const& a); -#endif - -bool cutlass_sparse_compress_entry(torch::Tensor& a_nzs, torch::Tensor& a_meta, - torch::Tensor const& a) { - // Checks for conformality - TORCH_CHECK(a.dim() == 2 && a_meta.dim() == 2 && a_nzs.dim() == 2); - TORCH_CHECK(a.size(0) == a_nzs.size(0) && a.size(0) == a_meta.size(0) && - a_nzs.size(1) * 2 == a.size(1) && - a_meta.size(1) * 2 * 4 == a.size(1)); - // Considering elemsPerMetaElem = 8b / 2b_per_nz = 4 - - // Check for strides and alignment - TORCH_CHECK(a.stride(1) == 1 && a_nzs.stride(1) == 1 && - a_meta.stride(1) == 1); // Row-major - TORCH_CHECK(a.stride(0) % 8 == 0); // 8 Byte Alignment for Compression - - at::cuda::OptionalCUDAGuard const device_guard(device_of(a)); - int32_t version_num = get_sm_version_num(); - - // Guard against compilation issues for sm90 kernels -#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X - if (version_num >= 90) { - return cutlass_sparse_compress_sm90(a_nzs, a_meta, a); - } -#endif - - TORCH_CHECK_NOT_IMPLEMENTED( - false, - "No compiled cutlass_scaled_sparse_mm for a compute capability less than " - "CUDA device capability: ", - version_num); -} diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index 5a1879787c3..3dcaa6373f1 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -9,17 +9,30 @@ using namespace cute; using namespace vllm; +struct GemmCallerTraits { + using return_type = void; + + template + 
static return_type invoke(Args&&... args) { + return cutlass_sparse_gemm_caller(std::forward(args)...); + } +}; + +struct GemmCompressorTraits { + using return_type = CompressorResult; + + template + static return_type invoke(Args&&... args) { + return cutlass_sparse_compress(std::forward(args)...); + } +}; + template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& bt_nzs, - torch::Tensor const& bt_meta, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn); - TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); - TORCH_CHECK(bt_nzs.dtype() == torch::kFloat8_e4m3fn); + typename DispatchFunc, typename... Args> +typename DispatchFunc::return_type cutlass_gemm_sm90_fp8_dispatch( + uint32_t m, uint32_t n, Args&&... args) { + static_assert(std::is_same_v); using Cutlass3xGemmDefault = typename sm90_config_default::Cutlass3xGemm; @@ -49,122 +62,87 @@ void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a, using Cutlass3xGemm8 = typename sm90_fp8_config_8::Cutlass3xGemm; - uint32_t const n = bt_nzs.size(0); - uint32_t const m = a.size(0); // Batch size uint32_t const mp2 = std::max(static_cast(64), next_pow_2(m)); // next power of 2 if (mp2 <= 64) { if (n == 28672) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (n == 4096 || n == 6144) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } } else if (mp2 <= 128) { if (n == 4096) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (n == 28672) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (n == 6144) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } } else if (mp2 <= 256) { if (n == 4096) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (n == 28672) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (n == 6144) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } } else { if (n == 6144 || n == 28672) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (n == 4096) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } } // Otherwise the default heuristic if (mp2 <= 64) { // n in [1, 64] - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (mp2 <= 128) { // n in (64, 128] - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return 
DispatchFunc::template invoke( + std::forward(args)...); } else if (mp2 <= 256) { // n in (128, 256] - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else { // n in (256, inf) - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } } template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm90_fp16_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& bt_nzs, - torch::Tensor const& bt_meta, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kFloat16); - TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); - TORCH_CHECK(bt_nzs.dtype() == torch::kFloat16); - - using Cutlass3xGemmDefault = - typename sm90_config_default::Cutlass3xGemm; - - // m in (128, inf) - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); -} - -template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm90_bf16_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& bt_nzs, - torch::Tensor const& bt_meta, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kBFloat16); - TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); - TORCH_CHECK(bt_nzs.dtype() == torch::kBFloat16); - + typename DispatchFunc, typename... Args> +typename DispatchFunc::return_type cutlass_gemm_sm90_16bit_dispatch( + uint32_t m, uint32_t n, Args&&... args) { using Cutlass3xGemmDefault = typename sm90_config_default::Cutlass3xGemm; - // m in (128, inf) - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } template typename Epilogue, - typename... EpilogueArgs> -void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& bt_nzs, - torch::Tensor const& bt_meta, - EpilogueArgs&&... args) { - static_assert(std::is_same()); - TORCH_CHECK(a.dtype() == torch::kInt8); - TORCH_CHECK(bt_meta.dtype() == torch::kUInt8); - TORCH_CHECK(bt_nzs.dtype() == torch::kInt8); + typename DispatchFunc, typename... Args> +typename DispatchFunc::return_type cutlass_gemm_sm90_int8_dispatch( + uint32_t m, uint32_t n, Args&&... 
args) { + static_assert(std::is_same_v); using Cutlass3xGemmDefault = typename sm90_config_default::Cutlass3xGemm; @@ -179,37 +157,35 @@ void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a, typename sm90_int8_config_M32_NSmall::Cutlass3xGemm; - uint32_t const n = out.size(1); bool const is_small_n = n < 8192; - - uint32_t const m = a.size(0); uint32_t const mp2 = std::max(static_cast(32), next_pow_2(m)); // next power of 2 if (mp2 <= 32) { // m in [1, 32] if (is_small_n) { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else { - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } } else if (mp2 <= 64) { // m in (32, 64] - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else if (mp2 <= 128) { // m in (64, 128] - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } else { // m in (128, inf) - return cutlass_sparse_gemm_caller( - out, a, bt_nzs, bt_meta, std::forward(args)...); + return DispatchFunc::template invoke( + std::forward(args)...); } } +// Dispatch to GEMM implementations based on element types template